tracy_rpmalloc.cpp
#ifdef TRACY_ENABLE

/* rpmalloc.c - Memory allocator - Public Domain - 2016 Mattias Jansson / Rampant Pixels
 *
 * This library provides a cross-platform lock free thread caching malloc implementation in C11.
 * The latest source code is always available at
 *
 * https://github.com/rampantpixels/rpmalloc
 *
 * This library is put in the public domain; you can redistribute it and/or modify it without any restrictions.
 *
 */

#include "tracy_rpmalloc.hpp"

/// Build time configurable limits
#ifndef HEAP_ARRAY_SIZE
//! Size of heap hashmap
#define HEAP_ARRAY_SIZE 79
#endif
#ifndef ENABLE_THREAD_CACHE
//! Enable per-thread cache
#define ENABLE_THREAD_CACHE 1
#endif
#ifndef ENABLE_GLOBAL_CACHE
//! Enable global cache shared between all threads, requires thread cache
#define ENABLE_GLOBAL_CACHE 1
#endif
#ifndef ENABLE_VALIDATE_ARGS
//! Enable validation of args to public entry points
#define ENABLE_VALIDATE_ARGS 0
#endif
#ifndef ENABLE_STATISTICS
//! Enable statistics collection
#define ENABLE_STATISTICS 0
#endif
#ifndef ENABLE_ASSERTS
//! Enable asserts
#define ENABLE_ASSERTS 0
#endif
#ifndef ENABLE_PRELOAD
//! Support preloading
#define ENABLE_PRELOAD 0
#endif
#ifndef ENABLE_GUARDS
//! Enable overwrite/underwrite guards
#define ENABLE_GUARDS 0
#endif
#ifndef ENABLE_UNLIMITED_CACHE
//! Unlimited cache disables any cache limitations
#define ENABLE_UNLIMITED_CACHE 0
#endif
#ifndef DEFAULT_SPAN_MAP_COUNT
//! Default number of spans to map in call to map more virtual memory
#define DEFAULT_SPAN_MAP_COUNT 16
#endif
//! Minimum cache size to remain after a release to global cache
#define MIN_SPAN_CACHE_SIZE 64
//! Minimum number of spans to transfer between thread and global cache
#define MIN_SPAN_CACHE_RELEASE 16
//! Maximum cache size divisor (max cache size will be max allocation count divided by this divisor)
#define MAX_SPAN_CACHE_DIVISOR 4
//! Minimum cache size to remain after a release to global cache, large spans
#define MIN_LARGE_SPAN_CACHE_SIZE 8
//! Minimum number of spans to transfer between thread and global cache, large spans
#define MIN_LARGE_SPAN_CACHE_RELEASE 4
//! Maximum cache size divisor, large spans (max cache size will be max allocation count divided by this divisor)
#define MAX_LARGE_SPAN_CACHE_DIVISOR 16
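/* [Editor's note -- illustrative addition, not part of the original rpmalloc source.]
   The build-time knobs above (HEAP_ARRAY_SIZE, the ENABLE_* switches, DEFAULT_SPAN_MAP_COUNT)
   are guarded by #ifndef, so a build can override them without editing this file, e.g. via
   compiler -D flags or a forced include. A hypothetical configuration (values chosen only
   for illustration): */
#if 0
#define ENABLE_STATISTICS      1   // collect mapped/unmapped page counters
#define ENABLE_ASSERTS         1   // keep the internal asserts in the build
#define DEFAULT_SPAN_MAP_COUNT 32  // request 32 spans per call that maps more virtual memory
#endif
//!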
Multiplier for global span cache limit (max cache size will be calculated like thread cache and multiplied with this) 70 #define MAX_GLOBAL_CACHE_MULTIPLIER 8 71 72 #if !ENABLE_THREAD_CACHE 73 # undef ENABLE_GLOBAL_CACHE 74 # define ENABLE_GLOBAL_CACHE 0 75 # undef MIN_SPAN_CACHE_SIZE 76 # undef MIN_SPAN_CACHE_RELEASE 77 # undef MAX_SPAN_CACHE_DIVISOR 78 # undef MIN_LARGE_SPAN_CACHE_SIZE 79 # undef MIN_LARGE_SPAN_CACHE_RELEASE 80 # undef MAX_LARGE_SPAN_CACHE_DIVISOR 81 #endif 82 #if !ENABLE_GLOBAL_CACHE 83 # undef MAX_GLOBAL_CACHE_MULTIPLIER 84 #endif 85 86 /// Platform and arch specifics 87 #ifdef _MSC_VER 88 # pragma warning( push ) 89 # pragma warning( disable : 4324 ) 90 # define ALIGNED_STRUCT(name, alignment) __declspec(align(alignment)) struct name 91 # define FORCEINLINE __forceinline 92 # define atomic_thread_fence_acquire() //_ReadWriteBarrier() 93 # define atomic_thread_fence_release() //_ReadWriteBarrier() 94 # if ENABLE_VALIDATE_ARGS 95 # include <Intsafe.h> 96 # endif 97 # include <intrin.h> 98 #else 99 # include <unistd.h> 100 # if defined(__APPLE__) && ENABLE_PRELOAD 101 # include <pthread.h> 102 # endif 103 # define ALIGNED_STRUCT(name, alignment) struct __attribute__((__aligned__(alignment))) name 104 # ifdef FORCEINLINE 105 # undef FORCEINLINE 106 # endif 107 # define FORCEINLINE inline __attribute__((__always_inline__)) 108 # ifdef __arm__ 109 # define atomic_thread_fence_acquire() __asm volatile("dmb ish" ::: "memory") 110 # define atomic_thread_fence_release() __asm volatile("dmb ishst" ::: "memory") 111 # else 112 # define atomic_thread_fence_acquire() //__asm volatile("" ::: "memory") 113 # define atomic_thread_fence_release() //__asm volatile("" ::: "memory") 114 # endif 115 #endif 116 117 #if defined( __x86_64__ ) || defined( _M_AMD64 ) || defined( _M_X64 ) || defined( _AMD64_ ) || defined( __arm64__ ) || defined( __aarch64__ ) 118 # define ARCH_64BIT 1 119 #else 120 # define ARCH_64BIT 0 121 #endif 122 123 #if defined( _WIN32 ) || defined( __WIN32__ ) || defined( _WIN64 ) 124 # define PLATFORM_WINDOWS 1 125 # define PLATFORM_POSIX 0 126 #else 127 # define PLATFORM_WINDOWS 0 128 # define PLATFORM_POSIX 1 129 #endif 130 131 #include <stdint.h> 132 #include <string.h> 133 134 #include <assert.h> 135 136 #if ENABLE_GUARDS 137 # define MAGIC_GUARD 0xDEADBAAD 138 #endif 139 140 namespace tracy 141 { 142 143 /// Atomic access abstraction 144 ALIGNED_STRUCT(atomic32_t, 4) { 145 volatile int32_t nonatomic; 146 }; 147 typedef struct atomic32_t atomic32_t; 148 149 ALIGNED_STRUCT(atomic64_t, 8) { 150 volatile int64_t nonatomic; 151 }; 152 typedef struct atomic64_t atomic64_t; 153 154 ALIGNED_STRUCT(atomicptr_t, 8) { 155 volatile void* nonatomic; 156 }; 157 typedef struct atomicptr_t atomicptr_t; 158 159 static FORCEINLINE int32_t 160 atomic_load32(atomic32_t* src) { 161 return src->nonatomic; 162 } 163 164 static FORCEINLINE void 165 atomic_store32(atomic32_t* dst, int32_t val) { 166 dst->nonatomic = val; 167 } 168 169 static FORCEINLINE int32_t 170 atomic_incr32(atomic32_t* val) { 171 #ifdef _MSC_VER 172 int32_t old = (int32_t)_InterlockedExchangeAdd((volatile long*)&val->nonatomic, 1); 173 return (old + 1); 174 #else 175 return __sync_add_and_fetch(&val->nonatomic, 1); 176 #endif 177 } 178 179 static FORCEINLINE int32_t 180 atomic_add32(atomic32_t* val, int32_t add) { 181 #ifdef _MSC_VER 182 int32_t old = (int32_t)_InterlockedExchangeAdd((volatile long*)&val->nonatomic, add); 183 return (old + add); 184 #else 185 return __sync_add_and_fetch(&val->nonatomic, add); 186 
#endif 187 } 188 189 static FORCEINLINE void* 190 atomic_load_ptr(atomicptr_t* src) { 191 return (void*)((uintptr_t)src->nonatomic); 192 } 193 194 static FORCEINLINE void 195 atomic_store_ptr(atomicptr_t* dst, void* val) { 196 dst->nonatomic = val; 197 } 198 199 static FORCEINLINE int 200 atomic_cas_ptr(atomicptr_t* dst, void* val, void* ref) { 201 #ifdef _MSC_VER 202 # if ARCH_64BIT 203 return (_InterlockedCompareExchange64((volatile long long*)&dst->nonatomic, 204 (long long)val, (long long)ref) == (long long)ref) ? 1 : 0; 205 # else 206 return (_InterlockedCompareExchange((volatile long*)&dst->nonatomic, 207 (long)val, (long)ref) == (long)ref) ? 1 : 0; 208 # endif 209 #else 210 return __sync_bool_compare_and_swap(&dst->nonatomic, ref, val); 211 #endif 212 } 213 214 /// Preconfigured limits and sizes 215 //! Granularity of a small allocation block 216 #define SMALL_GRANULARITY 32 217 //! Small granularity shift count 218 #define SMALL_GRANULARITY_SHIFT 5 219 //! Number of small block size classes 220 #define SMALL_CLASS_COUNT 63 221 //! Maximum size of a small block 222 #define SMALL_SIZE_LIMIT 2016 223 //! Granularity of a medium allocation block 224 #define MEDIUM_GRANULARITY 512 225 //! Medium granularity shift count 226 #define MEDIUM_GRANULARITY_SHIFT 9 227 //! Number of medium block size classes 228 #define MEDIUM_CLASS_COUNT 60 229 //! Total number of small + medium size classes 230 #define SIZE_CLASS_COUNT (SMALL_CLASS_COUNT + MEDIUM_CLASS_COUNT) 231 //! Number of large block size classes 232 #define LARGE_CLASS_COUNT 32 233 //! Maximum size of a medium block 234 #define MEDIUM_SIZE_LIMIT (SMALL_SIZE_LIMIT + (MEDIUM_GRANULARITY * MEDIUM_CLASS_COUNT) - SPAN_HEADER_SIZE) 235 //! Maximum size of a large block 236 #define LARGE_SIZE_LIMIT ((LARGE_CLASS_COUNT * _memory_span_size) - SPAN_HEADER_SIZE) 237 //! Size of a span header 238 #define SPAN_HEADER_SIZE 32 239 240 #define pointer_offset(ptr, ofs) (void*)((char*)(ptr) + (ptrdiff_t)(ofs)) 241 #define pointer_diff(first, second) (ptrdiff_t)((const char*)(first) - (const char*)(second)) 242 243 #if ARCH_64BIT 244 typedef int64_t offset_t; 245 #else 246 typedef int32_t offset_t; 247 #endif 248 typedef uint32_t count_t; 249 250 #if ENABLE_VALIDATE_ARGS 251 //! Maximum allocation size to avoid integer overflow 252 #undef MAX_ALLOC_SIZE 253 #define MAX_ALLOC_SIZE (((size_t)-1) - _memory_span_size) 254 #endif 255 256 /// Data types 257 //! A memory heap, per thread 258 typedef struct heap_t heap_t; 259 //! Span of memory pages 260 typedef struct span_t span_t; 261 //! Size class definition 262 typedef struct size_class_t size_class_t; 263 //! Span block bookkeeping 264 typedef struct span_block_t span_block_t; 265 //! Span list bookkeeping 266 typedef struct span_list_t span_list_t; 267 //! Span data union, usage depending on span state 268 typedef union span_data_t span_data_t; 269 //! Cache data 270 typedef struct span_counter_t span_counter_t; 271 //! Global cache 272 typedef struct global_cache_t global_cache_t; 273 274 //! Flag indicating span is the first (master) span of a split superspan 275 #define SPAN_FLAG_MASTER 1 276 //! Flag indicating span is a secondary (sub) span of a split superspan 277 #define SPAN_FLAG_SUBSPAN 2 278 279 //Alignment offset must match in both structures to keep the data when 280 //transitioning between being used for blocks and being part of a list 281 struct span_block_t { 282 //! Free list 283 uint16_t free_list; 284 //! First autolinked block 285 uint16_t first_autolink; 286 //! 
Free count
	uint16_t free_count;
	//! Alignment offset
	uint16_t align_offset;
};

struct span_list_t {
	//! List size
	uint32_t size;
	//! Unused in lists
	uint16_t unused;
	//! Alignment offset
	uint16_t align_offset;
};

union span_data_t {
	//! Span data when used as blocks
	span_block_t block;
	//! Span data when used in lists
	span_list_t list;
};

//A span can either represent a single span of memory pages with the size declared by the span_map_count configuration
//variable, or a set of spans in a contiguous region, a super span. Any reference to the term "span" usually refers to
//both a single span or a super span. A super span can further be divided into multiple spans (or, likewise, super
//spans), where the first (super)span is the master and subsequent (super)spans are subspans. The master span keeps
//track of how many subspans are still alive and mapped in virtual memory, and once all subspans and the master have
//been unmapped the entire superspan region is released and unmapped (on Windows for example, the entire superspan
//range has to be released in the same call to release the virtual memory range, but individual subranges can be
//decommitted individually to reduce physical memory use).
struct span_t {
	//! Heap ID
	atomic32_t heap_id;
	//! Size class
	uint16_t size_class;
	// TODO: If we could store remainder part of flags as an atomic counter, the entire check
	// if master is owned by calling heap could be simplified to an atomic dec from any thread
	// since remainder of a split super span only ever decreases, never increases
	//! Flags and counters
	uint16_t flags;
	//! Span data
	span_data_t data;
	//! Next span
	span_t* next_span;
	//! Previous span
	span_t* prev_span;
};
static_assert(sizeof(span_t) <= SPAN_HEADER_SIZE, "span size mismatch");

//Adaptive cache counter of a single superspan span count
struct span_counter_t {
	//! Allocation high water mark
	uint32_t max_allocations;
	//! Current number of allocations
	uint32_t current_allocations;
	//! Cache limit
	uint32_t cache_limit;
};

struct heap_t {
	//! Heap ID
	int32_t id;
	//! Free count for each size class active span
	span_block_t active_block[SIZE_CLASS_COUNT];
	//! Active span for each size class
	span_t* active_span[SIZE_CLASS_COUNT];
	//! List of semi-used spans with free blocks for each size class (double linked list)
	span_t* size_cache[SIZE_CLASS_COUNT];
#if ENABLE_THREAD_CACHE
	//! List of free spans (single linked list)
	span_t* span_cache[LARGE_CLASS_COUNT];
	//! Allocation counters
	span_counter_t span_counter[LARGE_CLASS_COUNT];
#endif
	//! Mapped but unused spans
	span_t* span_reserve;
	//! Master span for mapped but unused spans
	span_t* span_reserve_master;
	//! Number of mapped but unused spans
	size_t spans_reserved;
	//! Deferred deallocation
	atomicptr_t defer_deallocate;
	//! Deferred unmaps
	atomicptr_t defer_unmap;
	//! Next heap in id list
	heap_t* next_heap;
	//! Next heap in orphan list
	heap_t* next_orphan;
	//! Memory pages alignment offset
	size_t align_offset;
#if ENABLE_STATISTICS
	//! Number of bytes transitioned thread -> global
	size_t thread_to_global;
	//!
Number of bytes transitioned global -> thread 380 size_t global_to_thread; 381 #endif 382 }; 383 384 struct size_class_t { 385 //! Size of blocks in this class 386 uint32_t size; 387 //! Number of blocks in each chunk 388 uint16_t block_count; 389 //! Class index this class is merged with 390 uint16_t class_idx; 391 }; 392 static_assert(sizeof(size_class_t) == 8, "Size class size mismatch"); 393 394 struct global_cache_t { 395 //! Cache list pointer 396 atomicptr_t cache; 397 //! Cache size 398 atomic32_t size; 399 //! ABA counter 400 atomic32_t counter; 401 }; 402 403 /// Global data 404 //! Configuration 405 static rpmalloc_config_t _memory_config; 406 //! Memory page size 407 static size_t _memory_page_size; 408 //! Shift to divide by page size 409 static size_t _memory_page_size_shift; 410 //! Mask to get to start of a memory page 411 static size_t _memory_page_mask; 412 //! Granularity at which memory pages are mapped by OS 413 static size_t _memory_map_granularity; 414 //! Size of a span of memory pages 415 static size_t _memory_span_size; 416 //! Shift to divide by span size 417 static size_t _memory_span_size_shift; 418 //! Mask to get to start of a memory span 419 static uintptr_t _memory_span_mask; 420 //! Global size classes 421 static size_class_t _memory_size_class[SIZE_CLASS_COUNT]; 422 //! Run-time size limit of medium blocks 423 static size_t _memory_medium_size_limit; 424 //! Heap ID counter 425 static atomic32_t _memory_heap_id; 426 #if ENABLE_THREAD_CACHE 427 //! Adaptive cache max allocation count 428 static uint32_t _memory_max_allocation[LARGE_CLASS_COUNT]; 429 #endif 430 #if ENABLE_GLOBAL_CACHE 431 //! Global span cache 432 static global_cache_t _memory_span_cache[LARGE_CLASS_COUNT]; 433 #endif 434 //! All heaps 435 static atomicptr_t _memory_heaps[HEAP_ARRAY_SIZE]; 436 //! Orphaned heaps 437 static atomicptr_t _memory_orphan_heaps; 438 //! Running orphan counter to avoid ABA issues in linked list 439 static atomic32_t _memory_orphan_counter; 440 //! Active heap count 441 static atomic32_t _memory_active_heaps; 442 #if ENABLE_STATISTICS 443 //! Total number of currently mapped memory pages 444 static atomic32_t _mapped_pages; 445 //! Total number of currently lost spans 446 static atomic32_t _reserved_spans; 447 //! Running counter of total number of mapped memory pages since start 448 static atomic32_t _mapped_total; 449 //! Running counter of total number of unmapped memory pages since start 450 static atomic32_t _unmapped_total; 451 #endif 452 453 #define MEMORY_UNUSED(x) (void)sizeof((x)) 454 455 //! Current thread heap 456 #if defined(__APPLE__) && ENABLE_PRELOAD 457 static pthread_key_t _memory_thread_heap; 458 #else 459 # ifdef _MSC_VER 460 # define _Thread_local __declspec(thread) 461 # define TLS_MODEL 462 # else 463 # define TLS_MODEL __attribute__((tls_model("initial-exec"))) 464 # if !defined(__clang__) && defined(__GNUC__) 465 # define _Thread_local __thread 466 # endif 467 # endif 468 static _Thread_local heap_t* _memory_thread_heap TLS_MODEL; 469 #endif 470 471 //! Get the current thread heap 472 static FORCEINLINE heap_t* 473 get_thread_heap(void) { 474 #if defined(__APPLE__) && ENABLE_PRELOAD 475 return pthread_getspecific(_memory_thread_heap); 476 #else 477 return _memory_thread_heap; 478 #endif 479 } 480 481 //! 
Set the current thread heap 482 static void 483 set_thread_heap(heap_t* heap) { 484 #if defined(__APPLE__) && ENABLE_PRELOAD 485 pthread_setspecific(_memory_thread_heap, heap); 486 #else 487 _memory_thread_heap = heap; 488 #endif 489 } 490 491 //! Default implementation to map more virtual memory 492 static void* 493 _memory_map_os(size_t size, size_t* offset); 494 495 //! Default implementation to unmap virtual memory 496 static void 497 _memory_unmap_os(void* address, size_t size, size_t offset, int release); 498 499 //! Deallocate any deferred blocks and check for the given size class 500 static int 501 _memory_deallocate_deferred(heap_t* heap, size_t size_class); 502 503 //! Lookup a memory heap from heap ID 504 static heap_t* 505 _memory_heap_lookup(int32_t id) { 506 uint32_t list_idx = id % HEAP_ARRAY_SIZE; 507 heap_t* heap = (heap_t*)atomic_load_ptr(&_memory_heaps[list_idx]); 508 while (heap && (heap->id != id)) 509 heap = heap->next_heap; 510 return heap; 511 } 512 513 #if ENABLE_THREAD_CACHE 514 515 //! Increase an allocation counter 516 static void 517 _memory_counter_increase(span_counter_t* counter, uint32_t* global_counter, size_t span_count) { 518 if (++counter->current_allocations > counter->max_allocations) { 519 counter->max_allocations = counter->current_allocations; 520 const uint32_t cache_limit_max = (uint32_t)_memory_span_size - 2; 521 #if !ENABLE_UNLIMITED_CACHE 522 counter->cache_limit = counter->max_allocations / ((span_count == 1) ? MAX_SPAN_CACHE_DIVISOR : MAX_LARGE_SPAN_CACHE_DIVISOR); 523 const uint32_t cache_limit_min = (span_count == 1) ? (MIN_SPAN_CACHE_RELEASE + MIN_SPAN_CACHE_SIZE) : (MIN_LARGE_SPAN_CACHE_RELEASE + MIN_LARGE_SPAN_CACHE_SIZE); 524 if (counter->cache_limit < cache_limit_min) 525 counter->cache_limit = cache_limit_min; 526 if (counter->cache_limit > cache_limit_max) 527 counter->cache_limit = cache_limit_max; 528 #else 529 counter->cache_limit = cache_limit_max; 530 #endif 531 if (counter->max_allocations > *global_counter) 532 *global_counter = counter->max_allocations; 533 } 534 } 535 536 #else 537 # define _memory_counter_increase(counter, global_counter, span_count) do {} while (0) 538 #endif 539 540 #if ENABLE_STATISTICS 541 # define _memory_statistics_add(atomic_counter, value) atomic_add32(atomic_counter, (int32_t)(value)) 542 # define _memory_statistics_sub(atomic_counter, value) atomic_add32(atomic_counter, -(int32_t)(value)) 543 #else 544 # define _memory_statistics_add(atomic_counter, value) do {} while(0) 545 # define _memory_statistics_sub(atomic_counter, value) do {} while(0) 546 #endif 547 548 //! Map more virtual memory 549 static void* 550 _memory_map(size_t size, size_t* offset) { 551 assert(!(size % _memory_page_size)); 552 _memory_statistics_add(&_mapped_pages, (size >> _memory_page_size_shift)); 553 _memory_statistics_add(&_mapped_total, (size >> _memory_page_size_shift)); 554 return _memory_config.memory_map(size, offset); 555 } 556 557 //! Unmap virtual memory 558 static void 559 _memory_unmap(void* address, size_t size, size_t offset, int release) { 560 assert((size < _memory_span_size) || !((uintptr_t)address & ~_memory_span_mask)); 561 assert(!(size % _memory_page_size)); 562 _memory_statistics_sub(&_mapped_pages, (size >> _memory_page_size_shift)); 563 _memory_statistics_add(&_unmapped_total, (size >> _memory_page_size_shift)); 564 _memory_config.memory_unmap(address, size, offset, release); 565 } 566 567 //! 
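/* [Editor's note -- illustrative addition.] A worked example of the adaptive limit computed
   by _memory_counter_increase above, assuming the defaults in this file
   (MAX_SPAN_CACHE_DIVISOR 4, MIN_SPAN_CACHE_RELEASE 16, MIN_SPAN_CACHE_SIZE 64) and
   single-span allocations:

     high-water mark   80 allocations ->   80/4 =  20, clamped up to 16+64 = 80
     high-water mark 1000 allocations -> 1000/4 = 250  (within [80, span_size-2])

   so a thread that has held at most 1000 single spans at once will cache up to 250 free
   spans locally before spilling to the global cache. The limit only changes when a new
   high-water mark is reached. */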
Make flags field in a span from flags, remainder/distance and count 568 #define SPAN_MAKE_FLAGS(flags, remdist, count) ((uint16_t)((flags) | ((uint16_t)((remdist) - 1) << 2) | ((uint16_t)((count) - 1) << 9))); assert((flags) < 4); assert((remdist) && (remdist) < 128); assert((count) && (count) < 128) 569 //! Check if span has any of the given flags 570 #define SPAN_HAS_FLAG(flags, flag) ((flags) & (flag)) 571 //! Get the distance from flags field 572 #define SPAN_DISTANCE(flags) (1 + (((flags) >> 2) & 0x7f)) 573 //! Get the remainder from flags field 574 #define SPAN_REMAINS(flags) (1 + (((flags) >> 2) & 0x7f)) 575 //! Get the count from flags field 576 #define SPAN_COUNT(flags) (1 + (((flags) >> 9) & 0x7f)) 577 //! Set the remainder in the flags field (MUST be done from the owner heap thread) 578 #define SPAN_SET_REMAINS(flags, remains) flags = ((uint16_t)(((flags) & 0xfe03) | ((uint16_t)((remains) - 1) << 2))); assert((remains) < 128) 579 580 //! Resize the given super span to the given count of spans, store the remainder in the heap reserved spans fields 581 static void 582 _memory_set_span_remainder_as_reserved(heap_t* heap, span_t* span, size_t use_count) { 583 size_t current_count = SPAN_COUNT(span->flags); 584 585 assert(!SPAN_HAS_FLAG(span->flags, SPAN_FLAG_MASTER) || !SPAN_HAS_FLAG(span->flags, SPAN_FLAG_SUBSPAN)); 586 assert((current_count > 1) && (current_count < 127)); 587 assert(!heap->spans_reserved); 588 assert((size_t)SPAN_COUNT(span->flags) == current_count); 589 assert(current_count > use_count); 590 591 heap->span_reserve = (span_t*)pointer_offset(span, use_count * _memory_span_size); 592 heap->spans_reserved = current_count - use_count; 593 if (!SPAN_HAS_FLAG(span->flags, SPAN_FLAG_MASTER | SPAN_FLAG_SUBSPAN)) { 594 //We must store the heap id before setting as master, to force unmaps to defer to this heap thread 595 atomic_store32(&span->heap_id, heap->id); 596 atomic_thread_fence_release(); 597 heap->span_reserve_master = span; 598 span->flags = SPAN_MAKE_FLAGS(SPAN_FLAG_MASTER, current_count, use_count); 599 _memory_statistics_add(&_reserved_spans, current_count); 600 } 601 else if (SPAN_HAS_FLAG(span->flags, SPAN_FLAG_MASTER)) { 602 //Only owner heap thread can modify a master span 603 assert(atomic_load32(&span->heap_id) == heap->id); 604 uint16_t remains = SPAN_REMAINS(span->flags); 605 assert(remains >= current_count); 606 heap->span_reserve_master = span; 607 span->flags = SPAN_MAKE_FLAGS(SPAN_FLAG_MASTER, remains, use_count); 608 } 609 else { //SPAN_FLAG_SUBSPAN 610 //Resizing a subspan is a safe operation in any thread 611 uint16_t distance = SPAN_DISTANCE(span->flags); 612 span_t* master = (span_t*)pointer_offset(span, -(int)distance * (int)_memory_span_size); 613 heap->span_reserve_master = master; 614 assert(SPAN_HAS_FLAG(master->flags, SPAN_FLAG_MASTER)); 615 assert((size_t)SPAN_REMAINS(master->flags) >= current_count); 616 span->flags = SPAN_MAKE_FLAGS(SPAN_FLAG_SUBSPAN, distance, use_count); 617 } 618 assert((SPAN_COUNT(span->flags) + heap->spans_reserved) == current_count); 619 } 620 621 //! 
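/* [Editor's note -- illustrative addition.] A hypothetical walk-through of the flag packing
   done by the macros above: bits 0-1 hold the flags, bits 2-8 hold (remainder-or-distance - 1),
   bits 9-15 hold (span count - 1). Values below are chosen only for illustration. */
#if 0
uint16_t f = SPAN_MAKE_FLAGS(SPAN_FLAG_SUBSPAN, /*remdist*/ 4, /*count*/ 2);
// f == 2 | ((4 - 1) << 2) | ((2 - 1) << 9) == 0x020e
assert(SPAN_HAS_FLAG(f, SPAN_FLAG_SUBSPAN));
assert(SPAN_DISTANCE(f) == 4); // 1 + ((f >> 2) & 0x7f)
assert(SPAN_COUNT(f) == 2);    // 1 + ((f >> 9) & 0x7f)
#endif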
Map in memory pages for the given number of spans (or use previously reserved pages) 622 static span_t* 623 _memory_map_spans(heap_t* heap, size_t span_count) { 624 if (span_count <= heap->spans_reserved) { 625 span_t* span = heap->span_reserve; 626 heap->span_reserve = (span_t*)pointer_offset(span, span_count * _memory_span_size); 627 heap->spans_reserved -= span_count; 628 //Declare the span to be a subspan with given distance from master span 629 uint16_t distance = (uint16_t)((uintptr_t)pointer_diff(span, heap->span_reserve_master) >> _memory_span_size_shift); 630 span->flags = SPAN_MAKE_FLAGS(SPAN_FLAG_SUBSPAN, distance, span_count); 631 span->data.block.align_offset = 0; 632 return span; 633 } 634 635 //We cannot request extra spans if we already have some (but not enough) pending reserved spans 636 size_t request_spans = (heap->spans_reserved || (span_count > _memory_config.span_map_count)) ? span_count : _memory_config.span_map_count; 637 size_t align_offset = 0; 638 span_t* span = (span_t*)_memory_map(request_spans * _memory_span_size, &align_offset); 639 span->flags = SPAN_MAKE_FLAGS(0, request_spans, request_spans); 640 span->data.block.align_offset = (uint16_t)align_offset; 641 if (request_spans > span_count) { 642 //We have extra spans, store them as reserved spans in heap 643 _memory_set_span_remainder_as_reserved(heap, span, span_count); 644 } 645 return span; 646 } 647 648 //! Defer unmapping of the given span to the owner heap 649 static int 650 _memory_unmap_defer(int32_t heap_id, span_t* span) { 651 //Get the heap and link in pointer in list of deferred operations 652 heap_t* heap = _memory_heap_lookup(heap_id); 653 if (!heap) 654 return 0; 655 atomic_store32(&span->heap_id, heap_id); 656 void* last_ptr; 657 do { 658 last_ptr = atomic_load_ptr(&heap->defer_unmap); 659 span->next_span = (span_t*)last_ptr; 660 } while (!atomic_cas_ptr(&heap->defer_unmap, span, last_ptr)); 661 return 1; 662 } 663 664 //! Unmap memory pages for the given number of spans (or mark as unused if no partial unmappings) 665 static void 666 _memory_unmap_span(heap_t* heap, span_t* span) { 667 size_t span_count = SPAN_COUNT(span->flags); 668 assert(!SPAN_HAS_FLAG(span->flags, SPAN_FLAG_MASTER) || !SPAN_HAS_FLAG(span->flags, SPAN_FLAG_SUBSPAN)); 669 //A plain run of spans can be unmapped directly 670 if (!SPAN_HAS_FLAG(span->flags, SPAN_FLAG_MASTER | SPAN_FLAG_SUBSPAN)) { 671 _memory_unmap(span, span_count * _memory_span_size, span->data.list.align_offset, 1); 672 return; 673 } 674 675 uint32_t is_master = SPAN_HAS_FLAG(span->flags, SPAN_FLAG_MASTER); 676 span_t* master = is_master ? 
span : (span_t*)(pointer_offset(span, -(int)SPAN_DISTANCE(span->flags) * (int)_memory_span_size)); 677 678 assert(is_master || SPAN_HAS_FLAG(span->flags, SPAN_FLAG_SUBSPAN)); 679 assert(SPAN_HAS_FLAG(master->flags, SPAN_FLAG_MASTER)); 680 681 //Check if we own the master span, otherwise defer (only owner of master span can modify remainder field) 682 int32_t master_heap_id = atomic_load32(&master->heap_id); 683 if (heap && (master_heap_id != heap->id)) { 684 if (_memory_unmap_defer(master_heap_id, span)) 685 return; 686 } 687 if (!is_master) { 688 //Directly unmap subspans 689 assert(span->data.list.align_offset == 0); 690 _memory_unmap(span, span_count * _memory_span_size, 0, 0); 691 _memory_statistics_sub(&_reserved_spans, span_count); 692 } 693 else { 694 //Special double flag to denote an unmapped master 695 //It must be kept in memory since span header must be used 696 span->flags |= SPAN_FLAG_MASTER | SPAN_FLAG_SUBSPAN; 697 } 698 //We are in owner thread of the master span 699 uint32_t remains = SPAN_REMAINS(master->flags); 700 assert(remains >= span_count); 701 remains = ((uint32_t)span_count >= remains) ? 0 : (remains - (uint32_t)span_count); 702 if (!remains) { 703 //Everything unmapped, unmap the master span with release flag to unmap the entire range of the super span 704 assert(SPAN_HAS_FLAG(master->flags, SPAN_FLAG_MASTER) && SPAN_HAS_FLAG(master->flags, SPAN_FLAG_SUBSPAN)); 705 span_count = SPAN_COUNT(master->flags); 706 _memory_unmap(master, span_count * _memory_span_size, master->data.list.align_offset, 1); 707 _memory_statistics_sub(&_reserved_spans, span_count); 708 } 709 else { 710 //Set remaining spans 711 SPAN_SET_REMAINS(master->flags, remains); 712 } 713 } 714 715 //! Process pending deferred cross-thread unmaps 716 static span_t* 717 _memory_unmap_deferred(heap_t* heap, size_t wanted_count) { 718 //Grab the current list of deferred unmaps 719 atomic_thread_fence_acquire(); 720 span_t* span = (span_t*)atomic_load_ptr(&heap->defer_unmap); 721 if (!span || !atomic_cas_ptr(&heap->defer_unmap, 0, span)) 722 return 0; 723 span_t* found_span = 0; 724 do { 725 //Verify that we own the master span, otherwise re-defer to owner 726 void* next = span->next_span; 727 size_t span_count = SPAN_COUNT(span->flags); 728 if (!found_span && span_count == wanted_count) { 729 assert(!SPAN_HAS_FLAG(span->flags, SPAN_FLAG_MASTER) || !SPAN_HAS_FLAG(span->flags, SPAN_FLAG_SUBSPAN)); 730 found_span = span; 731 } 732 else { 733 uint32_t is_master = SPAN_HAS_FLAG(span->flags, SPAN_FLAG_MASTER); 734 span_t* master = is_master ? span : (span_t*)(pointer_offset(span, -(int)SPAN_DISTANCE(span->flags) * (int)_memory_span_size)); 735 int32_t master_heap_id = atomic_load32(&master->heap_id); 736 if ((atomic_load32(&span->heap_id) == master_heap_id) || 737 !_memory_unmap_defer(master_heap_id, span)) { 738 //We own the master span (or heap merged and abandoned) 739 _memory_unmap_span(heap, span); 740 } 741 } 742 span = (span_t*)next; 743 } while (span); 744 return found_span; 745 } 746 747 //! Unmap a single linked list of spans 748 static void 749 _memory_unmap_span_list(heap_t* heap, span_t* span) { 750 size_t list_size = span->data.list.size; 751 for (size_t ispan = 0; ispan < list_size; ++ispan) { 752 span_t* next_span = span->next_span; 753 _memory_unmap_span(heap, span); 754 span = next_span; 755 } 756 assert(!span); 757 } 758 759 #if ENABLE_THREAD_CACHE 760 761 //! 
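/* [Editor's note -- illustrative addition.] A sketch of the master/subspan accounting
   implemented above, with numbers chosen only for illustration. Suppose a 16-span super span
   is mapped (e.g. with the default span_map_count of 16) and split: the first span becomes
   the master with a "remains" count of 16, and later splits produce subspans that carry their
   distance back to the master. Each time a subspan (or the master itself) is unmapped,
   _memory_unmap_span subtracts that span count from the master's remains -- deferring to the
   owning heap's thread if the caller does not own the master -- and only when remains reaches
   zero is the whole 16-span virtual range released in a single call. */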
Split a super span in two 762 static span_t* 763 _memory_span_split(heap_t* heap, span_t* span, size_t use_count) { 764 uint16_t distance = 0; 765 size_t current_count = SPAN_COUNT(span->flags); 766 assert(current_count > use_count); 767 assert(!SPAN_HAS_FLAG(span->flags, SPAN_FLAG_MASTER) || !SPAN_HAS_FLAG(span->flags, SPAN_FLAG_SUBSPAN)); 768 if (!SPAN_HAS_FLAG(span->flags, SPAN_FLAG_MASTER | SPAN_FLAG_SUBSPAN)) { 769 //Must store heap in master span before use, to avoid issues when unmapping subspans 770 atomic_store32(&span->heap_id, heap->id); 771 atomic_thread_fence_release(); 772 span->flags = SPAN_MAKE_FLAGS(SPAN_FLAG_MASTER, current_count, use_count); 773 _memory_statistics_add(&_reserved_spans, current_count); 774 } 775 else if (SPAN_HAS_FLAG(span->flags, SPAN_FLAG_MASTER)) { 776 //Only valid to call on master span if we own it 777 assert(atomic_load32(&span->heap_id) == heap->id); 778 uint16_t remains = SPAN_REMAINS(span->flags); 779 assert(remains >= current_count); 780 span->flags = SPAN_MAKE_FLAGS(SPAN_FLAG_MASTER, remains, use_count); 781 } 782 else { //SPAN_FLAG_SUBSPAN 783 distance = SPAN_DISTANCE(span->flags); 784 span->flags = SPAN_MAKE_FLAGS(SPAN_FLAG_SUBSPAN, distance, use_count); 785 } 786 //Setup remainder as a subspan 787 span_t* subspan = (span_t*)pointer_offset(span, use_count * _memory_span_size); 788 subspan->flags = SPAN_MAKE_FLAGS(SPAN_FLAG_SUBSPAN, distance + use_count, current_count - use_count); 789 subspan->data.list.align_offset = 0; 790 return subspan; 791 } 792 793 //! Add span to head of single linked span list 794 static size_t 795 _memory_span_list_push(span_t** head, span_t* span) { 796 span->next_span = *head; 797 if (*head) 798 span->data.list.size = (*head)->data.list.size + 1; 799 else 800 span->data.list.size = 1; 801 *head = span; 802 return span->data.list.size; 803 } 804 805 //! Remove span from head of single linked span list, returns the new list head 806 static span_t* 807 _memory_span_list_pop(span_t** head) { 808 span_t* span = *head; 809 span_t* next_span = 0; 810 if (span->data.list.size > 1) { 811 next_span = span->next_span; 812 assert(next_span); 813 next_span->data.list.size = span->data.list.size - 1; 814 } 815 *head = next_span; 816 return span; 817 } 818 819 //! Split a single linked span list 820 static span_t* 821 _memory_span_list_split(span_t* span, size_t limit) { 822 span_t* next = 0; 823 if (limit < 2) 824 limit = 2; 825 if (span->data.list.size > limit) { 826 count_t list_size = 1; 827 span_t* last = span; 828 next = span->next_span; 829 while (list_size < limit) { 830 last = next; 831 next = next->next_span; 832 ++list_size; 833 } 834 last->next_span = 0; 835 assert(next); 836 next->data.list.size = span->data.list.size - list_size; 837 span->data.list.size = list_size; 838 span->prev_span = 0; 839 } 840 return next; 841 } 842 843 #endif 844 845 //! Add a span to a double linked list 846 static void 847 _memory_span_list_doublelink_add(span_t** head, span_t* span) { 848 if (*head) { 849 (*head)->prev_span = span; 850 span->next_span = *head; 851 } 852 else { 853 span->next_span = 0; 854 } 855 *head = span; 856 } 857 858 //! 
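/* [Editor's note -- illustrative addition.] The single linked span lists above keep their
   length in the head span's data.list.size field, so push and pop stay O(1) and a split can
   hand off an exact count. A hypothetical sequence (a, b, c are spans obtained elsewhere;
   names invented for the sketch): */
#if 0
span_t* cache = 0;                                 // empty list
_memory_span_list_push(&cache, a);                 // returns 1; a is head, a->data.list.size == 1
_memory_span_list_push(&cache, b);                 // returns 2; b is the new head
_memory_span_list_push(&cache, c);                 // returns 3; c is the new head
span_t* rest = _memory_span_list_split(cache, 2);  // cache keeps c,b (size 2); rest is a (size 1)
span_t* top  = _memory_span_list_pop(&cache);      // returns c; head is now b with size 1
#endif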
Remove a span from a double linked list 859 static void 860 _memory_span_list_doublelink_remove(span_t** head, span_t* span) { 861 if (*head == span) { 862 *head = span->next_span; 863 } 864 else { 865 span_t* next_span = span->next_span; 866 span_t* prev_span = span->prev_span; 867 if (next_span) 868 next_span->prev_span = prev_span; 869 prev_span->next_span = next_span; 870 } 871 } 872 873 #if ENABLE_GLOBAL_CACHE 874 875 //! Insert the given list of memory page spans in the global cache 876 static void 877 _memory_cache_insert(heap_t* heap, global_cache_t* cache, span_t* span, size_t cache_limit) { 878 assert((span->data.list.size == 1) || (span->next_span != 0)); 879 int32_t list_size = (int32_t)span->data.list.size; 880 //Unmap if cache has reached the limit 881 if (atomic_add32(&cache->size, list_size) > (int32_t)cache_limit) { 882 _memory_unmap_span_list(heap, span); 883 atomic_add32(&cache->size, -list_size); 884 return; 885 } 886 void* current_cache, *new_cache; 887 do { 888 current_cache = atomic_load_ptr(&cache->cache); 889 span->prev_span = (span_t*)(void*)((uintptr_t)current_cache & _memory_span_mask); 890 new_cache = (void*)((uintptr_t)span | ((uintptr_t)atomic_incr32(&cache->counter) & ~_memory_span_mask)); 891 } while (!atomic_cas_ptr(&cache->cache, new_cache, current_cache)); 892 } 893 894 //! Extract a number of memory page spans from the global cache 895 static span_t* 896 _memory_cache_extract(global_cache_t* cache) { 897 uintptr_t span_ptr; 898 do { 899 void* global_span = atomic_load_ptr(&cache->cache); 900 span_ptr = (uintptr_t)global_span & _memory_span_mask; 901 if (span_ptr) { 902 span_t* span = (span_t*)(void*)span_ptr; 903 //By accessing the span ptr before it is swapped out of list we assume that a contending thread 904 //does not manage to traverse the span to being unmapped before we access it 905 void* new_cache = (void*)((uintptr_t)span->prev_span | ((uintptr_t)atomic_incr32(&cache->counter) & ~_memory_span_mask)); 906 if (atomic_cas_ptr(&cache->cache, new_cache, global_span)) { 907 atomic_add32(&cache->size, -(int32_t)span->data.list.size); 908 return span; 909 } 910 } 911 } while (span_ptr); 912 return 0; 913 } 914 915 //! Finalize a global cache, only valid from allocator finalization (not thread safe) 916 static void 917 _memory_cache_finalize(global_cache_t* cache) { 918 void* current_cache = atomic_load_ptr(&cache->cache); 919 span_t* span = (span_t*)(void*)((uintptr_t)current_cache & _memory_span_mask); 920 while (span) { 921 span_t* skip_span = (span_t*)(void*)((uintptr_t)span->prev_span & _memory_span_mask); 922 atomic_add32(&cache->size, -(int32_t)span->data.list.size); 923 _memory_unmap_span_list(0, span); 924 span = skip_span; 925 } 926 assert(!atomic_load32(&cache->size)); 927 atomic_store_ptr(&cache->cache, 0); 928 atomic_store32(&cache->size, 0); 929 } 930 931 //! Insert the given list of memory page spans in the global cache 932 static void 933 _memory_global_cache_insert(heap_t* heap, span_t* span) { 934 //Calculate adaptive limits 935 size_t span_count = SPAN_COUNT(span->flags); 936 const size_t cache_divisor = (span_count == 1) ? MAX_SPAN_CACHE_DIVISOR : (MAX_LARGE_SPAN_CACHE_DIVISOR * span_count * 2); 937 const size_t cache_limit = (MAX_GLOBAL_CACHE_MULTIPLIER * _memory_max_allocation[span_count - 1]) / cache_divisor; 938 const size_t cache_limit_min = MAX_GLOBAL_CACHE_MULTIPLIER * (span_count == 1 ? 
MIN_SPAN_CACHE_SIZE : MIN_LARGE_SPAN_CACHE_SIZE); 939 _memory_cache_insert(heap, &_memory_span_cache[span_count - 1], span, cache_limit > cache_limit_min ? cache_limit : cache_limit_min); 940 } 941 942 //! Extract a number of memory page spans from the global cache for large blocks 943 static span_t* 944 _memory_global_cache_extract(size_t span_count) { 945 span_t* span = _memory_cache_extract(&_memory_span_cache[span_count - 1]); 946 assert(!span || ((size_t)SPAN_COUNT(span->flags) == span_count)); 947 return span; 948 } 949 950 #endif 951 952 //! Insert a single span into thread heap cache, releasing to global cache if overflow 953 static void 954 _memory_heap_cache_insert(heap_t* heap, span_t* span) { 955 #if ENABLE_THREAD_CACHE 956 size_t span_count = SPAN_COUNT(span->flags); 957 size_t idx = span_count - 1; 958 if (_memory_span_list_push(&heap->span_cache[idx], span) <= heap->span_counter[idx].cache_limit) 959 return; 960 heap->span_cache[idx] = _memory_span_list_split(span, heap->span_counter[idx].cache_limit); 961 assert(span->data.list.size == heap->span_counter[idx].cache_limit); 962 #if ENABLE_STATISTICS 963 heap->thread_to_global += (size_t)span->data.list.size * span_count * _memory_span_size; 964 #endif 965 #if ENABLE_GLOBAL_CACHE 966 _memory_global_cache_insert(heap, span); 967 #else 968 _memory_unmap_span_list(heap, span); 969 #endif 970 #else 971 _memory_unmap_span(heap, span); 972 #endif 973 } 974 975 //! Extract the given number of spans from the different cache levels 976 static span_t* 977 _memory_heap_cache_extract(heap_t* heap, size_t span_count) { 978 #if ENABLE_THREAD_CACHE 979 size_t idx = span_count - 1; 980 //Step 1: check thread cache 981 if (heap->span_cache[idx]) 982 return _memory_span_list_pop(&heap->span_cache[idx]); 983 #endif 984 //Step 2: Check reserved spans 985 if (heap->spans_reserved >= span_count) 986 return _memory_map_spans(heap, span_count); 987 //Step 3: Try processing deferred unmappings 988 span_t* span = _memory_unmap_deferred(heap, span_count); 989 if (span) 990 return span; 991 #if ENABLE_THREAD_CACHE 992 //Step 4: Check larger super spans and split if we find one 993 for (++idx; idx < LARGE_CLASS_COUNT; ++idx) { 994 if (heap->span_cache[idx]) { 995 span = _memory_span_list_pop(&heap->span_cache[idx]); 996 break; 997 } 998 } 999 if (span) { 1000 //Mark the span as owned by this heap before splitting 1001 size_t got_count = SPAN_COUNT(span->flags); 1002 assert(got_count > span_count); 1003 atomic_store32(&span->heap_id, heap->id); 1004 atomic_thread_fence_release(); 1005 1006 //Split the span and store as reserved if no previously reserved spans, or in thread cache otherwise 1007 span_t* subspan = _memory_span_split(heap, span, span_count); 1008 assert((size_t)(SPAN_COUNT(span->flags) + SPAN_COUNT(subspan->flags)) == got_count); 1009 assert((size_t)SPAN_COUNT(span->flags) == span_count); 1010 if (!heap->spans_reserved) { 1011 heap->spans_reserved = got_count - span_count; 1012 heap->span_reserve = subspan; 1013 heap->span_reserve_master = (span_t*)pointer_offset(subspan, -(int32_t)SPAN_DISTANCE(subspan->flags) * (int32_t)_memory_span_size); 1014 } 1015 else { 1016 _memory_heap_cache_insert(heap, subspan); 1017 } 1018 return span; 1019 } 1020 #if ENABLE_GLOBAL_CACHE 1021 //Step 5: Extract from global cache 1022 idx = span_count - 1; 1023 heap->span_cache[idx] = _memory_global_cache_extract(span_count); 1024 if (heap->span_cache[idx]) { 1025 #if ENABLE_STATISTICS 1026 heap->global_to_thread += (size_t)heap->span_cache[idx]->data.list.size * 
span_count * _memory_span_size; 1027 #endif 1028 return _memory_span_list_pop(&heap->span_cache[idx]); 1029 } 1030 #endif 1031 #endif 1032 return 0; 1033 } 1034 1035 //! Allocate a small/medium sized memory block from the given heap 1036 static void* 1037 _memory_allocate_from_heap(heap_t* heap, size_t size) { 1038 //Calculate the size class index and do a dependent lookup of the final class index (in case of merged classes) 1039 const size_t base_idx = (size <= SMALL_SIZE_LIMIT) ? 1040 ((size + (SMALL_GRANULARITY - 1)) >> SMALL_GRANULARITY_SHIFT) : 1041 SMALL_CLASS_COUNT + ((size - SMALL_SIZE_LIMIT + (MEDIUM_GRANULARITY - 1)) >> MEDIUM_GRANULARITY_SHIFT); 1042 assert(!base_idx || ((base_idx - 1) < SIZE_CLASS_COUNT)); 1043 const size_t class_idx = _memory_size_class[base_idx ? (base_idx - 1) : 0].class_idx; 1044 1045 span_block_t* active_block = heap->active_block + class_idx; 1046 size_class_t* size_class = _memory_size_class + class_idx; 1047 const count_t class_size = size_class->size; 1048 1049 //Step 1: Try to get a block from the currently active span. The span block bookkeeping 1050 // data for the active span is stored in the heap for faster access 1051 use_active: 1052 if (active_block->free_count) { 1053 //Happy path, we have a span with at least one free block 1054 span_t* span = heap->active_span[class_idx]; 1055 count_t offset = class_size * active_block->free_list; 1056 uint32_t* block = (uint32_t*)pointer_offset(span, SPAN_HEADER_SIZE + offset); 1057 assert(span); 1058 1059 --active_block->free_count; 1060 if (!active_block->free_count) { 1061 //Span is now completely allocated, set the bookkeeping data in the 1062 //span itself and reset the active span pointer in the heap 1063 span->data.block.free_count = 0; 1064 span->data.block.first_autolink = (uint16_t)size_class->block_count; 1065 heap->active_span[class_idx] = 0; 1066 } 1067 else { 1068 //Get the next free block, either from linked list or from auto link 1069 if (active_block->free_list < active_block->first_autolink) { 1070 active_block->free_list = (uint16_t)(*block); 1071 } 1072 else { 1073 ++active_block->free_list; 1074 ++active_block->first_autolink; 1075 } 1076 assert(active_block->free_list < size_class->block_count); 1077 } 1078 1079 return block; 1080 } 1081 1082 //Step 2: No active span, try executing deferred deallocations and try again if there 1083 // was at least one of the requested size class 1084 if (_memory_deallocate_deferred(heap, class_idx)) { 1085 if (active_block->free_count) 1086 goto use_active; 1087 } 1088 1089 //Step 3: Check if there is a semi-used span of the requested size class available 1090 if (heap->size_cache[class_idx]) { 1091 //Promote a pending semi-used span to be active, storing bookkeeping data in 1092 //the heap structure for faster access 1093 span_t* span = heap->size_cache[class_idx]; 1094 *active_block = span->data.block; 1095 assert(active_block->free_count > 0); 1096 heap->size_cache[class_idx] = span->next_span; 1097 heap->active_span[class_idx] = span; 1098 1099 //Mark span as owned by this heap 1100 atomic_store32(&span->heap_id, heap->id); 1101 atomic_thread_fence_release(); 1102 1103 goto use_active; 1104 } 1105 1106 //Step 4: Find a span in one of the cache levels 1107 span_t* span = _memory_heap_cache_extract(heap, 1); 1108 if (!span) { 1109 //Step 5: Map in more virtual memory 1110 span = _memory_map_spans(heap, 1); 1111 } 1112 1113 //Mark span as owned by this heap and set base data 1114 assert(SPAN_COUNT(span->flags) == 1); 1115 span->size_class = 
(uint16_t)class_idx; 1116 atomic_store32(&span->heap_id, heap->id); 1117 atomic_thread_fence_release(); 1118 1119 //If we only have one block we will grab it, otherwise 1120 //set span as new span to use for next allocation 1121 if (size_class->block_count > 1) { 1122 //Reset block order to sequential auto linked order 1123 active_block->free_count = (uint16_t)(size_class->block_count - 1); 1124 active_block->free_list = 1; 1125 active_block->first_autolink = 1; 1126 heap->active_span[class_idx] = span; 1127 } 1128 else { 1129 span->data.block.free_count = 0; 1130 span->data.block.first_autolink = (uint16_t)size_class->block_count; 1131 } 1132 1133 //Track counters 1134 _memory_counter_increase(&heap->span_counter[0], &_memory_max_allocation[0], 1); 1135 1136 //Return first block if memory page span 1137 return pointer_offset(span, SPAN_HEADER_SIZE); 1138 } 1139 1140 //! Allocate a large sized memory block from the given heap 1141 static void* 1142 _memory_allocate_large_from_heap(heap_t* heap, size_t size) { 1143 //Calculate number of needed max sized spans (including header) 1144 //Since this function is never called if size > LARGE_SIZE_LIMIT 1145 //the span_count is guaranteed to be <= LARGE_CLASS_COUNT 1146 size += SPAN_HEADER_SIZE; 1147 size_t span_count = size >> _memory_span_size_shift; 1148 if (size & (_memory_span_size - 1)) 1149 ++span_count; 1150 size_t idx = span_count - 1; 1151 1152 #if ENABLE_THREAD_CACHE 1153 if (!heap->span_cache[idx]) 1154 _memory_deallocate_deferred(heap, SIZE_CLASS_COUNT + idx); 1155 #else 1156 _memory_deallocate_deferred(heap, SIZE_CLASS_COUNT + idx); 1157 #endif 1158 //Step 1: Find span in one of the cache levels 1159 span_t* span = _memory_heap_cache_extract(heap, span_count); 1160 if (!span) { 1161 //Step 2: Map in more virtual memory 1162 span = _memory_map_spans(heap, span_count); 1163 } 1164 1165 //Mark span as owned by this heap and set base data 1166 assert((size_t)SPAN_COUNT(span->flags) == span_count); 1167 span->size_class = (uint16_t)(SIZE_CLASS_COUNT + idx); 1168 atomic_store32(&span->heap_id, heap->id); 1169 atomic_thread_fence_release(); 1170 1171 //Increase counter 1172 _memory_counter_increase(&heap->span_counter[idx], &_memory_max_allocation[idx], span_count); 1173 1174 return pointer_offset(span, SPAN_HEADER_SIZE); 1175 } 1176 1177 //! 
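/* [Editor's note -- illustrative addition.] Two worked examples of the sizing math in the two
   allocation paths above, assuming the default 32-byte small granularity and a 64KiB span
   size (values below are only for illustration): */
#if 0
// Small path: a 100-byte request
size_t size = 100;
size_t base_idx = (size + (SMALL_GRANULARITY - 1)) >> SMALL_GRANULARITY_SHIFT; // (100+31)/32 == 4
// _memory_size_class[base_idx - 1].size == 128, so the block handed out is 128 bytes

// Large path: a 200000-byte request
size_t large = 200000 + SPAN_HEADER_SIZE;              // 200032 including the span header
size_t span_count = large >> _memory_span_size_shift;  // 3 whole 64KiB spans
if (large & (_memory_span_size - 1))
	++span_count;                                      // remainder -> 4 spans total
#endif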
Allocate a new heap 1178 static heap_t* 1179 _memory_allocate_heap(void) { 1180 void* raw_heap; 1181 void* next_raw_heap; 1182 uintptr_t orphan_counter; 1183 heap_t* heap; 1184 heap_t* next_heap; 1185 //Try getting an orphaned heap 1186 atomic_thread_fence_acquire(); 1187 do { 1188 raw_heap = atomic_load_ptr(&_memory_orphan_heaps); 1189 heap = (heap_t*)(void*)((uintptr_t)raw_heap & _memory_page_mask); 1190 if (!heap) 1191 break; 1192 next_heap = heap->next_orphan; 1193 orphan_counter = (uintptr_t)atomic_incr32(&_memory_orphan_counter); 1194 next_raw_heap = (void*)((uintptr_t)next_heap | (orphan_counter & ~_memory_page_mask)); 1195 } 1196 while (!atomic_cas_ptr(&_memory_orphan_heaps, next_raw_heap, raw_heap)); 1197 1198 if (!heap) { 1199 //Map in pages for a new heap 1200 size_t align_offset = 0; 1201 heap = (heap_t*)_memory_map((1 + (sizeof(heap_t) >> _memory_page_size_shift)) * _memory_page_size, &align_offset); 1202 memset(heap, 0, sizeof(heap_t)); 1203 heap->align_offset = align_offset; 1204 1205 //Get a new heap ID 1206 do { 1207 heap->id = atomic_incr32(&_memory_heap_id); 1208 if (_memory_heap_lookup(heap->id)) 1209 heap->id = 0; 1210 } while (!heap->id); 1211 1212 //Link in heap in heap ID map 1213 size_t list_idx = heap->id % HEAP_ARRAY_SIZE; 1214 do { 1215 next_heap = (heap_t*)atomic_load_ptr(&_memory_heaps[list_idx]); 1216 heap->next_heap = next_heap; 1217 } while (!atomic_cas_ptr(&_memory_heaps[list_idx], heap, next_heap)); 1218 } 1219 1220 #if ENABLE_THREAD_CACHE 1221 heap->span_counter[0].cache_limit = MIN_SPAN_CACHE_RELEASE + MIN_SPAN_CACHE_SIZE; 1222 for (size_t idx = 1; idx < LARGE_CLASS_COUNT; ++idx) 1223 heap->span_counter[idx].cache_limit = MIN_LARGE_SPAN_CACHE_RELEASE + MIN_LARGE_SPAN_CACHE_SIZE; 1224 #endif 1225 1226 //Clean up any deferred operations 1227 _memory_unmap_deferred(heap, 0); 1228 _memory_deallocate_deferred(heap, 0); 1229 1230 return heap; 1231 } 1232 1233 //! Deallocate the given small/medium memory block from the given heap 1234 static void 1235 _memory_deallocate_to_heap(heap_t* heap, span_t* span, void* p) { 1236 //Check if span is the currently active span in order to operate 1237 //on the correct bookkeeping data 1238 assert(SPAN_COUNT(span->flags) == 1); 1239 const count_t class_idx = span->size_class; 1240 size_class_t* size_class = _memory_size_class + class_idx; 1241 int is_active = (heap->active_span[class_idx] == span); 1242 span_block_t* block_data = is_active ? 1243 heap->active_block + class_idx : 1244 &span->data.block; 1245 1246 //Check if the span will become completely free 1247 if (block_data->free_count == ((count_t)size_class->block_count - 1)) { 1248 #if ENABLE_THREAD_CACHE 1249 //Track counters 1250 assert(heap->span_counter[0].current_allocations > 0); 1251 if (heap->span_counter[0].current_allocations) 1252 --heap->span_counter[0].current_allocations; 1253 #endif 1254 1255 //If it was active, reset counter. 
Otherwise, if not active, remove from 1256 //partial free list if we had a previous free block (guard for classes with only 1 block) 1257 if (is_active) 1258 block_data->free_count = 0; 1259 else if (block_data->free_count > 0) 1260 _memory_span_list_doublelink_remove(&heap->size_cache[class_idx], span); 1261 1262 //Add to heap span cache 1263 _memory_heap_cache_insert(heap, span); 1264 return; 1265 } 1266 1267 //Check if first free block for this span (previously fully allocated) 1268 if (block_data->free_count == 0) { 1269 //add to free list and disable autolink 1270 _memory_span_list_doublelink_add(&heap->size_cache[class_idx], span); 1271 block_data->first_autolink = (uint16_t)size_class->block_count; 1272 } 1273 ++block_data->free_count; 1274 //Span is not yet completely free, so add block to the linked list of free blocks 1275 void* blocks_start = pointer_offset(span, SPAN_HEADER_SIZE); 1276 count_t block_offset = (count_t)pointer_diff(p, blocks_start); 1277 count_t block_idx = block_offset / (count_t)size_class->size; 1278 uint32_t* block = (uint32_t*)pointer_offset(blocks_start, block_idx * size_class->size); 1279 *block = block_data->free_list; 1280 block_data->free_list = (uint16_t)block_idx; 1281 } 1282 1283 //! Deallocate the given large memory block from the given heap 1284 static void 1285 _memory_deallocate_large_to_heap(heap_t* heap, span_t* span) { 1286 //Decrease counter 1287 size_t idx = (size_t)span->size_class - SIZE_CLASS_COUNT; 1288 size_t span_count = idx + 1; 1289 assert((size_t)SPAN_COUNT(span->flags) == span_count); 1290 assert(span->size_class >= SIZE_CLASS_COUNT); 1291 assert(idx < LARGE_CLASS_COUNT); 1292 #if ENABLE_THREAD_CACHE 1293 assert(heap->span_counter[idx].current_allocations > 0); 1294 if (heap->span_counter[idx].current_allocations) 1295 --heap->span_counter[idx].current_allocations; 1296 #endif 1297 if (!heap->spans_reserved && (span_count > 1)) { 1298 //Split the span and store remainder as reserved spans 1299 //Must split to a dummy 1-span master since we cannot have master spans as reserved 1300 _memory_set_span_remainder_as_reserved(heap, span, 1); 1301 span_count = 1; 1302 } 1303 1304 //Insert into cache list 1305 _memory_heap_cache_insert(heap, span); 1306 } 1307 1308 //! Process pending deferred cross-thread deallocations 1309 static int 1310 _memory_deallocate_deferred(heap_t* heap, size_t size_class) { 1311 //Grab the current list of deferred deallocations 1312 atomic_thread_fence_acquire(); 1313 void* p = atomic_load_ptr(&heap->defer_deallocate); 1314 if (!p || !atomic_cas_ptr(&heap->defer_deallocate, 0, p)) 1315 return 0; 1316 //Keep track if we deallocate in the given size class 1317 int got_class = 0; 1318 do { 1319 void* next = *(void**)p; 1320 //Get span and check which type of block 1321 span_t* span = (span_t*)(void*)((uintptr_t)p & _memory_span_mask); 1322 if (span->size_class < SIZE_CLASS_COUNT) { 1323 //Small/medium block 1324 got_class |= (span->size_class == size_class); 1325 _memory_deallocate_to_heap(heap, span, p); 1326 } 1327 else { 1328 //Large block 1329 got_class |= ((span->size_class >= size_class) && (span->size_class <= (size_class + 2))); 1330 _memory_deallocate_large_to_heap(heap, span); 1331 } 1332 //Loop until all pending operations in list are processed 1333 p = next; 1334 } while (p); 1335 return got_class; 1336 } 1337 1338 //! 
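/* [Editor's note -- illustrative addition.] In _memory_deallocate_to_heap above, a freed block
   becomes a free-list node in place: the 16-bit index of the previous list head is written into
   the block's first bytes. A hypothetical instance for a 128-byte size class (numbers only for
   illustration): */
#if 0
void* blocks_start = pointer_offset(span, SPAN_HEADER_SIZE);
// freeing the block that sits at byte offset 384 from blocks_start:
count_t block_idx = 384 / 128;                   // == 3
uint32_t* block = (uint32_t*)pointer_offset(blocks_start, 3 * 128);
*block = block_data->free_list;                  // link to the old head
block_data->free_list = (uint16_t)block_idx;     // block 3 is the new head
#endif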
Defer deallocation of the given block to the given heap 1339 static void 1340 _memory_deallocate_defer(int32_t heap_id, void* p) { 1341 //Get the heap and link in pointer in list of deferred operations 1342 heap_t* heap = _memory_heap_lookup(heap_id); 1343 if (!heap) 1344 return; 1345 void* last_ptr; 1346 do { 1347 last_ptr = atomic_load_ptr(&heap->defer_deallocate); 1348 *(void**)p = last_ptr; //Safe to use block, it's being deallocated 1349 } while (!atomic_cas_ptr(&heap->defer_deallocate, p, last_ptr)); 1350 } 1351 1352 //! Allocate a block of the given size 1353 static void* 1354 _memory_allocate(size_t size) { 1355 if (size <= _memory_medium_size_limit) 1356 return _memory_allocate_from_heap(get_thread_heap(), size); 1357 else if (size <= LARGE_SIZE_LIMIT) 1358 return _memory_allocate_large_from_heap(get_thread_heap(), size); 1359 1360 //Oversized, allocate pages directly 1361 size += SPAN_HEADER_SIZE; 1362 size_t num_pages = size >> _memory_page_size_shift; 1363 if (size & (_memory_page_size - 1)) 1364 ++num_pages; 1365 size_t align_offset = 0; 1366 span_t* span = (span_t*)_memory_map(num_pages * _memory_page_size, &align_offset); 1367 atomic_store32(&span->heap_id, 0); 1368 //Store page count in next_span 1369 span->next_span = (span_t*)((uintptr_t)num_pages); 1370 span->data.list.align_offset = (uint16_t)align_offset; 1371 1372 return pointer_offset(span, SPAN_HEADER_SIZE); 1373 } 1374 1375 //! Deallocate the given block 1376 static void 1377 _memory_deallocate(void* p) { 1378 if (!p) 1379 return; 1380 1381 //Grab the span (always at start of span, using 64KiB alignment) 1382 span_t* span = (span_t*)(void*)((uintptr_t)p & _memory_span_mask); 1383 int32_t heap_id = atomic_load32(&span->heap_id); 1384 heap_t* heap = get_thread_heap(); 1385 //Check if block belongs to this heap or if deallocation should be deferred 1386 if (heap_id == heap->id) { 1387 if (span->size_class < SIZE_CLASS_COUNT) 1388 _memory_deallocate_to_heap(heap, span, p); 1389 else 1390 _memory_deallocate_large_to_heap(heap, span); 1391 } 1392 else if (heap_id > 0) { 1393 _memory_deallocate_defer(heap_id, p); 1394 } 1395 else { 1396 //Oversized allocation, page count is stored in next_span 1397 size_t num_pages = (size_t)span->next_span; 1398 _memory_unmap(span, num_pages * _memory_page_size, span->data.list.align_offset, 1); 1399 } 1400 } 1401 1402 //! 
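/* [Editor's note -- illustrative addition.] Deallocation above relies on every span being
   naturally aligned to the span size, so the owning span_t header is recovered from any block
   pointer with a single mask. A sketch assuming the default 64KiB spans (address invented for
   illustration): */
#if 0
void* p = (void*)0x7f3a1c258e40;                             // some block returned by the allocator
span_t* span = (span_t*)((uintptr_t)p & _memory_span_mask);  // 0x7f3a1c250000 with 64KiB spans
int32_t owner = atomic_load32(&span->heap_id);
// owner == current heap id -> free directly into this heap
// owner  > 0 (other heap)  -> push onto that heap's defer_deallocate list
// owner == 0               -> oversized allocation; unmap its pages directly
#endif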
Reallocate the given block to the given size 1403 static void* 1404 _memory_reallocate(void* p, size_t size, size_t oldsize, unsigned int flags) { 1405 if (p) { 1406 //Grab the span using guaranteed span alignment 1407 span_t* span = (span_t*)(void*)((uintptr_t)p & _memory_span_mask); 1408 int32_t heap_id = atomic_load32(&span->heap_id); 1409 if (heap_id) { 1410 if (span->size_class < SIZE_CLASS_COUNT) { 1411 //Small/medium sized block 1412 size_class_t* size_class = _memory_size_class + span->size_class; 1413 if ((size_t)size_class->size >= size) 1414 return p; //Still fits in block, never mind trying to save memory 1415 if (!oldsize) 1416 oldsize = size_class->size; 1417 } 1418 else { 1419 //Large block 1420 size_t total_size = size + SPAN_HEADER_SIZE; 1421 size_t num_spans = total_size >> _memory_span_size_shift; 1422 if (total_size & (_memory_span_mask - 1)) 1423 ++num_spans; 1424 size_t current_spans = (span->size_class - SIZE_CLASS_COUNT) + 1; 1425 if ((current_spans >= num_spans) && (num_spans >= (current_spans / 2))) 1426 return p; //Still fits and less than half of memory would be freed 1427 if (!oldsize) 1428 oldsize = (current_spans * _memory_span_size) - SPAN_HEADER_SIZE; 1429 } 1430 } 1431 else { 1432 //Oversized block 1433 size_t total_size = size + SPAN_HEADER_SIZE; 1434 size_t num_pages = total_size >> _memory_page_size_shift; 1435 if (total_size & (_memory_page_size - 1)) 1436 ++num_pages; 1437 //Page count is stored in next_span 1438 size_t current_pages = (size_t)span->next_span; 1439 if ((current_pages >= num_pages) && (num_pages >= (current_pages / 2))) 1440 return p; //Still fits and less than half of memory would be freed 1441 if (!oldsize) 1442 oldsize = (current_pages * _memory_page_size) - SPAN_HEADER_SIZE; 1443 } 1444 } 1445 1446 //Size is greater than block size, need to allocate a new block and deallocate the old 1447 //Avoid hysteresis by overallocating if increase is small (below 37%) 1448 size_t lower_bound = oldsize + (oldsize >> 2) + (oldsize >> 3); 1449 void* block = _memory_allocate((size > lower_bound) ? size : ((size > oldsize) ? lower_bound : size)); 1450 if (p) { 1451 if (!(flags & RPMALLOC_NO_PRESERVE)) 1452 memcpy(block, p, oldsize < size ? oldsize : size); 1453 _memory_deallocate(p); 1454 } 1455 1456 return block; 1457 } 1458 1459 //! Get the usable size of the given block 1460 static size_t 1461 _memory_usable_size(void* p) { 1462 //Grab the span using guaranteed span alignment 1463 span_t* span = (span_t*)(void*)((uintptr_t)p & _memory_span_mask); 1464 int32_t heap_id = atomic_load32(&span->heap_id); 1465 if (heap_id) { 1466 //Small/medium block 1467 if (span->size_class < SIZE_CLASS_COUNT) 1468 return _memory_size_class[span->size_class].size; 1469 1470 //Large block 1471 size_t current_spans = (span->size_class - SIZE_CLASS_COUNT) + 1; 1472 return (current_spans * _memory_span_size) - SPAN_HEADER_SIZE; 1473 } 1474 1475 //Oversized block, page count is stored in next_span 1476 size_t current_pages = (size_t)span->next_span; 1477 return (current_pages * _memory_page_size) - SPAN_HEADER_SIZE; 1478 } 1479 1480 //! 
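/* [Editor's note -- illustrative addition.] A worked example of the reallocation hysteresis
   above, once _memory_reallocate has decided the old block cannot be reused: the new block is
   padded to oldsize + oldsize/4 + oldsize/8 (37.5% above oldsize) whenever the requested growth
   stays below that bound, so repeated small size bumps do not trigger a copy every time. With
   oldsize == 1000 (illustrative):

     lower_bound = 1000 + 250 + 125 = 1375
     request 1200 bytes -> 1200 <= 1375 and 1200 > 1000, so 1375 bytes are allocated
     request 2000 bytes -> 2000 >  1375, so exactly 2000 bytes are allocated */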
Adjust and optimize the size class properties for the given class 1481 static void 1482 _memory_adjust_size_class(size_t iclass) { 1483 size_t block_size = _memory_size_class[iclass].size; 1484 size_t block_count = (_memory_span_size - SPAN_HEADER_SIZE) / block_size; 1485 1486 _memory_size_class[iclass].block_count = (uint16_t)block_count; 1487 _memory_size_class[iclass].class_idx = (uint16_t)iclass; 1488 1489 //Check if previous size classes can be merged 1490 size_t prevclass = iclass; 1491 while (prevclass > 0) { 1492 --prevclass; 1493 //A class can be merged if number of pages and number of blocks are equal 1494 if (_memory_size_class[prevclass].block_count == _memory_size_class[iclass].block_count) { 1495 memcpy(_memory_size_class + prevclass, _memory_size_class + iclass, sizeof(_memory_size_class[iclass])); 1496 } 1497 else { 1498 break; 1499 } 1500 } 1501 } 1502 1503 } 1504 1505 #if defined( _WIN32 ) || defined( __WIN32__ ) || defined( _WIN64 ) 1506 # include <windows.h> 1507 #else 1508 # include <sys/mman.h> 1509 # include <sched.h> 1510 # ifndef MAP_UNINITIALIZED 1511 # define MAP_UNINITIALIZED 0 1512 # endif 1513 #endif 1514 #include <errno.h> 1515 1516 namespace tracy 1517 { 1518 1519 //! Initialize the allocator and setup global data 1520 int 1521 rpmalloc_initialize(void) { 1522 memset(&_memory_config, 0, sizeof(rpmalloc_config_t)); 1523 return rpmalloc_initialize_config(0); 1524 } 1525 1526 int 1527 rpmalloc_initialize_config(const rpmalloc_config_t* config) { 1528 if (config) 1529 memcpy(&_memory_config, config, sizeof(rpmalloc_config_t)); 1530 1531 if (!_memory_config.memory_map || !_memory_config.memory_unmap) { 1532 _memory_config.memory_map = _memory_map_os; 1533 _memory_config.memory_unmap = _memory_unmap_os; 1534 } 1535 1536 _memory_page_size = _memory_config.page_size; 1537 if (!_memory_page_size) { 1538 #if PLATFORM_WINDOWS 1539 SYSTEM_INFO system_info; 1540 memset(&system_info, 0, sizeof(system_info)); 1541 GetSystemInfo(&system_info); 1542 _memory_page_size = system_info.dwPageSize; 1543 _memory_map_granularity = system_info.dwAllocationGranularity; 1544 #else 1545 _memory_page_size = (size_t)sysconf(_SC_PAGESIZE); 1546 _memory_map_granularity = _memory_page_size; 1547 #endif 1548 } 1549 1550 if (_memory_page_size < 512) 1551 _memory_page_size = 512; 1552 if (_memory_page_size > (16 * 1024)) 1553 _memory_page_size = (16 * 1024); 1554 1555 _memory_page_size_shift = 0; 1556 size_t page_size_bit = _memory_page_size; 1557 while (page_size_bit != 1) { 1558 ++_memory_page_size_shift; 1559 page_size_bit >>= 1; 1560 } 1561 _memory_page_size = ((size_t)1 << _memory_page_size_shift); 1562 _memory_page_mask = ~(uintptr_t)(_memory_page_size - 1); 1563 1564 size_t span_size = _memory_config.span_size; 1565 if (!span_size) 1566 span_size = (64 * 1024); 1567 if (span_size > (256 * 1024)) 1568 span_size = (256 * 1024); 1569 _memory_span_size = 4096; 1570 _memory_span_size_shift = 12; 1571 while ((_memory_span_size < span_size) || (_memory_span_size < _memory_page_size)) { 1572 _memory_span_size <<= 1; 1573 ++_memory_span_size_shift; 1574 } 1575 _memory_span_mask = ~(uintptr_t)(_memory_span_size - 1); 1576 1577 _memory_config.page_size = _memory_page_size; 1578 _memory_config.span_size = _memory_span_size; 1579 1580 if (!_memory_config.span_map_count) 1581 _memory_config.span_map_count = DEFAULT_SPAN_MAP_COUNT; 1582 if (_memory_config.span_size * _memory_config.span_map_count < _memory_config.page_size) 1583 _memory_config.span_map_count = (_memory_config.page_size / 
	if (_memory_config.span_map_count > 128)
		_memory_config.span_map_count = 128;

#if defined(__APPLE__) && ENABLE_PRELOAD
	if (pthread_key_create(&_memory_thread_heap, 0))
		return -1;
#endif

	atomic_store32(&_memory_heap_id, 0);
	atomic_store32(&_memory_orphan_counter, 0);
	atomic_store32(&_memory_active_heaps, 0);

	//Setup all small and medium size classes
	size_t iclass;
	for (iclass = 0; iclass < SMALL_CLASS_COUNT; ++iclass) {
		size_t size = (iclass + 1) * SMALL_GRANULARITY;
		_memory_size_class[iclass].size = (uint16_t)size;
		_memory_adjust_size_class(iclass);
	}

	_memory_medium_size_limit = _memory_span_size - SPAN_HEADER_SIZE;
	if (_memory_medium_size_limit > MEDIUM_SIZE_LIMIT)
		_memory_medium_size_limit = MEDIUM_SIZE_LIMIT;
	for (iclass = 0; iclass < MEDIUM_CLASS_COUNT; ++iclass) {
		size_t size = SMALL_SIZE_LIMIT + ((iclass + 1) * MEDIUM_GRANULARITY);
		if (size > _memory_medium_size_limit)
			size = _memory_medium_size_limit;
		_memory_size_class[SMALL_CLASS_COUNT + iclass].size = (uint16_t)size;
		_memory_adjust_size_class(SMALL_CLASS_COUNT + iclass);
	}

	//Initialize this thread
	rpmalloc_thread_initialize();
	return 0;
}

//! Finalize the allocator
void
rpmalloc_finalize(void) {
	atomic_thread_fence_acquire();

	rpmalloc_thread_finalize();
	//If you hit this assert, you still have active threads or forgot to finalize some thread(s)
	assert(atomic_load32(&_memory_active_heaps) == 0);

	//Free all thread caches
	for (size_t list_idx = 0; list_idx < HEAP_ARRAY_SIZE; ++list_idx) {
		heap_t* heap = (heap_t*)atomic_load_ptr(&_memory_heaps[list_idx]);
		while (heap) {
			_memory_deallocate_deferred(heap, 0);

			//Free span caches (other thread might have deferred after the thread using this heap finalized)
#if ENABLE_THREAD_CACHE
			for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) {
				if (heap->span_cache[iclass])
					_memory_unmap_span_list(0, heap->span_cache[iclass]);
			}
#endif
			heap = heap->next_heap;
		}
	}

#if ENABLE_GLOBAL_CACHE
	//Free global caches
	for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass)
		_memory_cache_finalize(&_memory_span_cache[iclass]);
#endif

	for (size_t list_idx = 0; list_idx < HEAP_ARRAY_SIZE; ++list_idx) {
		heap_t* heap = (heap_t*)atomic_load_ptr(&_memory_heaps[list_idx]);
		atomic_store_ptr(&_memory_heaps[list_idx], 0);
		while (heap) {
			if (heap->spans_reserved) {
				span_t* span = heap->span_reserve;
				span_t* master = heap->span_reserve_master;
				uint32_t remains = SPAN_REMAINS(master->flags);

				assert(master != span);
				assert(remains >= heap->spans_reserved);
				_memory_unmap(span, heap->spans_reserved * _memory_span_size, 0, 0);
				_memory_statistics_sub(&_reserved_spans, heap->spans_reserved);
				remains = ((uint32_t)heap->spans_reserved >= remains) ? 0 : (remains - (uint32_t)heap->spans_reserved);
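				//If nothing of the master span remains reserved it can be unmapped entirely below, otherwise only its remain count is updated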
				if (!remains) {
					uint32_t master_span_count = SPAN_COUNT(master->flags);
					_memory_statistics_sub(&_reserved_spans, master_span_count);
					_memory_unmap(master, master_span_count * _memory_span_size, master->data.list.align_offset, 1);
				}
				else {
					SPAN_SET_REMAINS(master->flags, remains);
				}
			}

			_memory_unmap_deferred(heap, 0);

			heap_t* next_heap = heap->next_heap;
			_memory_unmap(heap, (1 + (sizeof(heap_t) >> _memory_page_size_shift)) * _memory_page_size, heap->align_offset, 1);
			heap = next_heap;
		}
	}
	atomic_store_ptr(&_memory_orphan_heaps, 0);
	atomic_thread_fence_release();

#if ENABLE_STATISTICS
	//If you hit these asserts you probably have memory leaks or double frees in your code
	assert(!atomic_load32(&_mapped_pages));
	assert(!atomic_load32(&_reserved_spans));
#endif

#if defined(__APPLE__) && ENABLE_PRELOAD
	pthread_key_delete(_memory_thread_heap);
#endif
}

//! Initialize thread, assign heap
void
rpmalloc_thread_initialize(void) {
	if (!get_thread_heap()) {
		atomic_incr32(&_memory_active_heaps);
		heap_t* heap = _memory_allocate_heap();
#if ENABLE_STATISTICS
		heap->thread_to_global = 0;
		heap->global_to_thread = 0;
#endif
		set_thread_heap(heap);
	}
}

//! Finalize thread, orphan heap
void
rpmalloc_thread_finalize(void) {
	heap_t* heap = get_thread_heap();
	if (!heap)
		return;

	_memory_deallocate_deferred(heap, 0);
	_memory_unmap_deferred(heap, 0);

	//Release thread cache spans back to global cache
#if ENABLE_THREAD_CACHE
	for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) {
		span_t* span = heap->span_cache[iclass];
#if ENABLE_GLOBAL_CACHE
		const size_t span_count = iclass + 1;
		while (span) {
			assert((size_t)SPAN_COUNT(span->flags) == span_count);
			span_t* next = _memory_span_list_split(span, !iclass ? MIN_SPAN_CACHE_RELEASE : (MIN_LARGE_SPAN_CACHE_RELEASE / span_count));
			_memory_global_cache_insert(0, span);
			span = next;
		}
#else
		if (span)
			_memory_unmap_span_list(heap, span);
#endif
		heap->span_cache[iclass] = 0;
	}
#endif

	//Orphan the heap
	void* raw_heap;
	uintptr_t orphan_counter;
	heap_t* last_heap;
	do {
		last_heap = (heap_t*)atomic_load_ptr(&_memory_orphan_heaps);
		heap->next_orphan = (heap_t*)(void*)((uintptr_t)last_heap & _memory_page_mask);
		orphan_counter = (uintptr_t)atomic_incr32(&_memory_orphan_counter);
		raw_heap = (void*)((uintptr_t)heap | (orphan_counter & ~_memory_page_mask));
	}
	while (!atomic_cas_ptr(&_memory_orphan_heaps, raw_heap, last_heap));

	set_thread_heap(0);
	atomic_add32(&_memory_active_heaps, -1);
}

int
rpmalloc_is_thread_initialized(void) {
	return (get_thread_heap() != 0) ? 1 : 0;
}

const rpmalloc_config_t*
rpmalloc_config(void) {
	return &_memory_config;
}

//! Map new pages to virtual memory
static void*
_memory_map_os(size_t size, size_t* offset) {
	//Either size is a heap (a single page) or a (multiple) span - we only need to align spans
	size_t padding = ((size >= _memory_span_size) && (_memory_span_size > _memory_map_granularity)) ? _memory_span_size : 0;
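	//Example: with 64KiB spans and 4KiB mapping granularity the request is padded by one span so the start can be rounded up to a span boundary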

#if PLATFORM_WINDOWS
	//Ok to MEM_COMMIT - according to MSDN, "actual physical pages are not allocated unless/until the virtual addresses are actually accessed"
	void* ptr = VirtualAlloc(0, size + padding, MEM_RESERVE | MEM_COMMIT, PAGE_READWRITE);
	if (!ptr) {
		assert("Failed to map virtual memory block" && 0);
		return 0;
	}
#else
	void* ptr = mmap(0, size + padding, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS | MAP_UNINITIALIZED, -1, 0);
	if ((ptr == MAP_FAILED) || !ptr) {
		assert("Failed to map virtual memory block" && 0);
		return 0;
	}
#endif

	if (padding) {
		size_t final_padding = padding - ((uintptr_t)ptr & ~_memory_span_mask);
#if PLATFORM_POSIX
		//Unmap the last unused pages, for Windows this is done with the final VirtualFree with MEM_RELEASE call
		size_t remains = padding - final_padding;
		if (remains)
			munmap(pointer_offset(ptr, final_padding + size), remains);
#endif
		ptr = pointer_offset(ptr, final_padding);
		assert(final_padding <= _memory_span_size);
		assert(!(final_padding & 5));
		assert(!((uintptr_t)ptr & ~_memory_span_mask));
		*offset = final_padding >> 3;
		assert(*offset < 65536);
	}

	return ptr;
}

//! Unmap pages from virtual memory
static void
_memory_unmap_os(void* address, size_t size, size_t offset, int release) {
	assert(release || (offset == 0));
	if (release && offset) {
		offset <<= 3;
#if PLATFORM_POSIX
		size += offset;
#endif
		address = pointer_offset(address, -(int32_t)offset);
	}
#if PLATFORM_WINDOWS
	if (!VirtualFree(address, release ? 0 : size, release ? MEM_RELEASE : MEM_DECOMMIT)) {
		DWORD err = GetLastError();
		(void)err;
		assert("Failed to unmap virtual memory block" && 0);
	}
#else
	MEMORY_UNUSED(release);
	if (munmap(address, size)) {
		assert("Failed to unmap virtual memory block" && 0);
	}
#endif
}

#if ENABLE_GUARDS
static void
_memory_guard_validate(void* p) {
	if (!p)
		return;
	void* block_start;
	size_t block_size = _memory_usable_size(p);
	span_t* span = (span_t*)(void*)((uintptr_t)p & _memory_span_mask);
	int32_t heap_id = atomic_load32(&span->heap_id);
	if (heap_id) {
		if (span->size_class < SIZE_CLASS_COUNT) {
			void* span_blocks_start = pointer_offset(span, SPAN_HEADER_SIZE);
			size_class_t* size_class = _memory_size_class + span->size_class;
			count_t block_offset = (count_t)pointer_diff(p, span_blocks_start);
			count_t block_idx = block_offset / (count_t)size_class->size;
			block_start = pointer_offset(span_blocks_start, block_idx * size_class->size);
		}
		else {
			block_start = pointer_offset(span, SPAN_HEADER_SIZE);
		}
	}
	else {
		block_start = pointer_offset(span, SPAN_HEADER_SIZE);
	}
	uint32_t* deadzone = (uint32_t*)block_start;
	//If these asserts fire, you have written to memory before the block start
	for (int i = 0; i < 8; ++i) {
		if (deadzone[i] != MAGIC_GUARD) {
			if (_memory_config.memory_overwrite)
				_memory_config.memory_overwrite(p);
			else
				assert("Memory overwrite before block start" && 0);
			return;
		}
		deadzone[i] = 0;
	}
	deadzone = (uint32_t*)pointer_offset(block_start, block_size - 32);
	//If these asserts fire, you have written to memory after the block end
	for (int i = 0; i < 8; ++i) {
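		//Same check as above: each of the eight trailing 32-bit guard words must still hold MAGIC_GUARD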
		if (deadzone[i] != MAGIC_GUARD) {
			if (_memory_config.memory_overwrite)
				_memory_config.memory_overwrite(p);
			else
				assert("Memory overwrite after block end" && 0);
			return;
		}
		deadzone[i] = 0;
	}
}
#else
#define _memory_guard_validate(block)
#endif

#if ENABLE_GUARDS
static void
_memory_guard_block(void* block) {
	if (block) {
		size_t block_size = _memory_usable_size(block);
		uint32_t* deadzone = (uint32_t*)block;
		deadzone[0] = deadzone[1] = deadzone[2] = deadzone[3] =
			deadzone[4] = deadzone[5] = deadzone[6] = deadzone[7] = MAGIC_GUARD;
		deadzone = (uint32_t*)pointer_offset(block, block_size - 32);
		deadzone[0] = deadzone[1] = deadzone[2] = deadzone[3] =
			deadzone[4] = deadzone[5] = deadzone[6] = deadzone[7] = MAGIC_GUARD;
	}
}
#define _memory_guard_pre_alloc(size) size += 64
#define _memory_guard_pre_realloc(block, size) block = pointer_offset(block, -32); size += 64
#define _memory_guard_post_alloc(block, size) _memory_guard_block(block); block = pointer_offset(block, 32); size -= 64
#else
#define _memory_guard_pre_alloc(size)
#define _memory_guard_pre_realloc(block, size)
#define _memory_guard_post_alloc(block, size)
#endif

// Extern interface

TRACY_API RPMALLOC_RESTRICT void*
rpmalloc(size_t size) {
#if ENABLE_VALIDATE_ARGS
	if (size >= MAX_ALLOC_SIZE) {
		errno = EINVAL;
		return 0;
	}
#endif
	_memory_guard_pre_alloc(size);
	void* block = _memory_allocate(size);
	_memory_guard_post_alloc(block, size);
	return block;
}

TRACY_API void
rpfree(void* ptr) {
	_memory_guard_validate(ptr);
	_memory_deallocate(ptr);
}

RPMALLOC_RESTRICT void*
rpcalloc(size_t num, size_t size) {
	size_t total;
#if ENABLE_VALIDATE_ARGS
#if PLATFORM_WINDOWS
	int err = SizeTMult(num, size, &total);
	if ((err != S_OK) || (total >= MAX_ALLOC_SIZE)) {
		errno = EINVAL;
		return 0;
	}
#else
	int err = __builtin_umull_overflow(num, size, &total);
	if (err || (total >= MAX_ALLOC_SIZE)) {
		errno = EINVAL;
		return 0;
	}
#endif
#else
	total = num * size;
#endif
	_memory_guard_pre_alloc(total);
	void* block = _memory_allocate(total);
	_memory_guard_post_alloc(block, total);
	memset(block, 0, total);
	return block;
}

void*
rprealloc(void* ptr, size_t size) {
#if ENABLE_VALIDATE_ARGS
	if (size >= MAX_ALLOC_SIZE) {
		errno = EINVAL;
		return ptr;
	}
#endif
	_memory_guard_validate(ptr);
	_memory_guard_pre_realloc(ptr, size);
	void* block = _memory_reallocate(ptr, size, 0, 0);
	_memory_guard_post_alloc(block, size);
	return block;
}

void*
rpaligned_realloc(void* ptr, size_t alignment, size_t size, size_t oldsize,
                  unsigned int flags) {
#if ENABLE_VALIDATE_ARGS
	if ((size + alignment < size) || (alignment > _memory_page_size)) {
		errno = EINVAL;
		return 0;
	}
#endif
	void* block;
	if (alignment > 32) {
		block = rpaligned_alloc(alignment, size);
		if (!(flags & RPMALLOC_NO_PRESERVE))
			memcpy(block, ptr, oldsize < size ? oldsize : size);
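		//The old block always came from this allocator, so it is returned through the regular rpfree path after copying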
		rpfree(ptr);
	}
	else {
		_memory_guard_validate(ptr);
		_memory_guard_pre_realloc(ptr, size);
		block = _memory_reallocate(ptr, size, oldsize, flags);
		_memory_guard_post_alloc(block, size);
	}
	return block;
}

RPMALLOC_RESTRICT void*
rpaligned_alloc(size_t alignment, size_t size) {
	if (alignment <= 32)
		return rpmalloc(size);

#if ENABLE_VALIDATE_ARGS
	if ((size + alignment < size) || (alignment > _memory_page_size)) {
		errno = EINVAL;
		return 0;
	}
#endif

	void* ptr = rpmalloc(size + alignment);
	if ((uintptr_t)ptr & (alignment - 1))
		ptr = (void*)(((uintptr_t)ptr & ~((uintptr_t)alignment - 1)) + alignment);
	return ptr;
}

RPMALLOC_RESTRICT void*
rpmemalign(size_t alignment, size_t size) {
	return rpaligned_alloc(alignment, size);
}

int
rpposix_memalign(void **memptr, size_t alignment, size_t size) {
	if (memptr)
		*memptr = rpaligned_alloc(alignment, size);
	else
		return EINVAL;
	return *memptr ? 0 : ENOMEM;
}

size_t
rpmalloc_usable_size(void* ptr) {
	size_t size = 0;
	if (ptr) {
		size = _memory_usable_size(ptr);
#if ENABLE_GUARDS
		size -= 64;
#endif
	}
	return size;
}

void
rpmalloc_thread_collect(void) {
	heap_t* heap = get_thread_heap();
	_memory_unmap_deferred(heap, 0);
	_memory_deallocate_deferred(0, 0);
}

void
rpmalloc_thread_statistics(rpmalloc_thread_statistics_t* stats) {
	memset(stats, 0, sizeof(rpmalloc_thread_statistics_t));
	heap_t* heap = get_thread_heap();
	void* p = atomic_load_ptr(&heap->defer_deallocate);
	while (p) {
		void* next = *(void**)p;
		span_t* span = (span_t*)(void*)((uintptr_t)p & _memory_span_mask);
		stats->deferred += _memory_size_class[span->size_class].size;
		p = next;
	}

	for (size_t isize = 0; isize < SIZE_CLASS_COUNT; ++isize) {
		if (heap->active_block[isize].free_count)
			stats->active += heap->active_block[isize].free_count * _memory_size_class[heap->active_span[isize]->size_class].size;

		span_t* cache = heap->size_cache[isize];
		while (cache) {
			stats->sizecache += cache->data.block.free_count * _memory_size_class[cache->size_class].size;
			cache = cache->next_span;
		}
	}

#if ENABLE_THREAD_CACHE
	for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) {
		if (heap->span_cache[iclass])
			stats->spancache += (size_t)heap->span_cache[iclass]->data.list.size * (iclass + 1) * _memory_span_size;
	}
#endif
}

void
rpmalloc_global_statistics(rpmalloc_global_statistics_t* stats) {
	memset(stats, 0, sizeof(rpmalloc_global_statistics_t));
#if ENABLE_STATISTICS
	stats->mapped = (size_t)atomic_load32(&_mapped_pages) * _memory_page_size;
	stats->mapped_total = (size_t)atomic_load32(&_mapped_total) * _memory_page_size;
	stats->unmapped_total = (size_t)atomic_load32(&_unmapped_total) * _memory_page_size;
#endif
#if ENABLE_GLOBAL_CACHE
	for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) {
		stats->cached += (size_t)atomic_load32(&_memory_span_cache[iclass].size) * (iclass + 1) * _memory_span_size;
	}
#endif
}

}

#ifdef _MSC_VER
# pragma warning( pop )
#endif

#endif
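
// Illustrative usage sketch (not part of the allocator or of Tracy): shows the
// intended lifecycle of the extern interface defined above. Kept under "#if 0"
// so it never affects compilation; main() and the sizes used are hypothetical.
#if 0
#include <cstdio>

int main() {
	tracy::rpmalloc_initialize();            // set up global data and this thread's heap
	void* block = tracy::rpmalloc(128);      // small request, served from a size class
	block = tracy::rprealloc(block, 100000); // larger request, the block may be moved
	std::printf("usable size: %zu\n", tracy::rpmalloc_usable_size(block));
	tracy::rpfree(block);
	tracy::rpmalloc_finalize();              // all other threads must call rpmalloc_thread_finalize() first
	return 0;
}
#endif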