mudgangster

Tiny, scriptable MUD client

commit 33577cc2ae781b48b73a27381c2c5cef48ceac62
parent 7ee87faf62aa5ae0407ec328c8ba33a82aca6299
Author: Michael Savage <mikejsavage@gmail.com>
Date:   Sun,  3 May 2020 01:03:43 +0300

Add tracy

Diffstat:
M ggbuild/gen_ninja.lua | 2 +-
A libs/tracy.lua | 3 +++
A libs/tracy/LICENSE | 27 +++++++++++++++++++++++++++
A libs/tracy/Tracy.hpp | 177 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A libs/tracy/TracyC.h | 188 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A libs/tracy/TracyClient.cpp | 43 +++++++++++++++++++++++++++++++++++++++++++
A libs/tracy/TracyOpenGL.hpp | 272 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A libs/tracy/client/TracyArmCpuTable.hpp | 319 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A libs/tracy/client/TracyCallstack.cpp | 603 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A libs/tracy/client/TracyCallstack.h | 29 +++++++++++++++++++++++++++++
A libs/tracy/client/TracyCallstack.hpp | 112 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A libs/tracy/client/TracyDxt1.cpp | 646 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A libs/tracy/client/TracyDxt1.hpp | 11 +++++++++++
A libs/tracy/client/TracyFastVector.hpp | 116 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A libs/tracy/client/TracyLock.hpp | 527 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A libs/tracy/client/TracyProfiler.cpp | 2729 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A libs/tracy/client/TracyProfiler.hpp | 620 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A libs/tracy/client/TracyScoped.hpp | 119 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A libs/tracy/client/TracySysTime.cpp | 105 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A libs/tracy/client/TracySysTime.hpp | 36 ++++++++++++++++++++++++++++++++++++
A libs/tracy/client/TracySysTrace.cpp | 862 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A libs/tracy/client/TracySysTrace.hpp | 25 +++++++++++++++++++++++++
A libs/tracy/client/TracySysTracePayload.hpp | 80 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A libs/tracy/client/TracyThread.hpp | 70 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A libs/tracy/client/tracy_concurrentqueue.h | 1552 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A libs/tracy/client/tracy_rpmalloc.cpp | 2099 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A libs/tracy/client/tracy_rpmalloc.hpp | 153 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A libs/tracy/common/TracyAlign.hpp | 27 +++++++++++++++++++++++++++
A libs/tracy/common/TracyAlloc.hpp | 33 +++++++++++++++++++++++++++++++++
A libs/tracy/common/TracyApi.h | 14 ++++++++++++++
A libs/tracy/common/TracyColor.hpp | 690 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A libs/tracy/common/TracyForceInline.hpp | 20 ++++++++++++++++++++
A libs/tracy/common/TracyMutex.hpp | 33 +++++++++++++++++++++++++++++++++
A libs/tracy/common/TracyProtocol.hpp | 103 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A libs/tracy/common/TracyQueue.hpp | 500 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A libs/tracy/common/TracySocket.cpp | 561 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A libs/tracy/common/TracySocket.hpp | 131 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A libs/tracy/common/TracySystem.cpp | 187 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A libs/tracy/common/TracySystem.hpp | 80 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A libs/tracy/common/tracy_benaphore.h | 68 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A libs/tracy/common/tracy_lz4.cpp | 2297 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A libs/tracy/common/tracy_lz4.hpp | 679 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A libs/tracy/common/tracy_sema.h | 255 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
M make.lua | 8 +++++++-
M src/common.h | 2 ++
45 files changed, 17211 insertions(+), 2 deletions(-)
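
Almost all of the diff below vendors the Tracy client sources into libs/tracy/ and hooks them into the ninja build; make.lua and src/common.h each pick up a small change whose hunks fall outside this excerpt. For orientation, the zone macros declared by the new Tracy.hpp are used roughly as in the following sketch. It is hypothetical, not code from this commit: the function names are invented, and with TRACY_ENABLE left undefined every macro expands to nothing, so instrumented and plain builds share the same source.

// Hypothetical instrumentation sketch using the API vendored by this commit.
// Not part of the diff; the functions below are invented for illustration.
#include <stddef.h>
#include "libs/tracy/Tracy.hpp"

static void handle_line( const char * line, size_t len ) {
	ZoneScoped;              // times this scope, named after the enclosing function
	ZoneText( line, len );   // attaches the received line to the zone
	// ... parse ANSI codes, run triggers, render ...
}

static void main_loop_iteration() {
	ZoneScopedN( "main loop" );   // explicitly named zone
	// ... poll the socket, repaint the UI ...
	FrameMark;                    // tells the profiler a frame has ended
}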

diff --git a/ggbuild/gen_ninja.lua b/ggbuild/gen_ninja.lua
@@ -318,7 +318,7 @@ end
 
 local function rule_for_src( src_name )
 	local ext = src_name:match( "([^%.]+)$" )
-	return ( { cc = "cpp" } )[ ext ]
+	return ( { cc = "cpp", cpp = "cpp" } )[ ext ]
 end
 
 local function write_ninja_script()
diff --git a/libs/tracy.lua b/libs/tracy.lua
@@ -0,0 +1,3 @@
+lib( "tracy", { "libs/tracy/TracyClient.cpp" } )
+msvc_obj_cxxflags( "libs/tracy/TracyClient.cpp", "/O2" )
+gcc_obj_cxxflags( "libs/tracy/TracyClient.cpp", "-O2 -Wno-unused-function -Wno-maybe-uninitialized" )
diff --git a/libs/tracy/LICENSE b/libs/tracy/LICENSE
@@ -0,0 +1,27 @@
+Tracy Profiler (https://bitbucket.org/wolfpld/tracy) is licensed under the
+3-clause BSD license.
+
+Copyright (c) 2017-2019, Bartosz Taudul <wolf.pld@gmail.com>
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    * Neither the name of the <organization> nor the
+      names of its contributors may be used to endorse or promote products
+      derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL <COPYRIGHT HOLDER> BE LIABLE FOR ANY
+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
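
Besides zones, the Tracy.hpp added below also declares TracyAlloc and TracyFree for memory profiling. A common way to feed them is from replacement global operator new/delete; the sketch below is purely illustrative, is not something this commit adds, and both macros compile away when TRACY_ENABLE is not defined.

// Illustrative sketch only, not part of this commit: route global
// allocations through Tracy's memory events.
#include <cstdlib>
#include <new>
#include "libs/tracy/Tracy.hpp"

void * operator new( std::size_t size ) {
	void * ptr = std::malloc( size );
	if( ptr == nullptr )
		throw std::bad_alloc();
	TracyAlloc( ptr, size );   // records the allocation and its size
	return ptr;
}

void operator delete( void * ptr ) noexcept {
	TracyFree( ptr );          // records the matching free
	std::free( ptr );
}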
diff --git a/libs/tracy/Tracy.hpp b/libs/tracy/Tracy.hpp @@ -0,0 +1,177 @@ +#ifndef __TRACY_HPP__ +#define __TRACY_HPP__ + +#include "common/TracyColor.hpp" +#include "common/TracySystem.hpp" + +#ifndef TRACY_ENABLE + +#define ZoneNamed(x,y) +#define ZoneNamedN(x,y,z) +#define ZoneNamedC(x,y,z) +#define ZoneNamedNC(x,y,z,w) + +#define ZoneScoped +#define ZoneScopedN(x) +#define ZoneScopedC(x) +#define ZoneScopedNC(x,y) + +#define ZoneText(x,y) +#define ZoneName(x,y) + +#define FrameMark +#define FrameMarkNamed(x) +#define FrameMarkStart(x) +#define FrameMarkEnd(x) + +#define FrameImage(x,y,z,w,a) + +#define TracyLockable( type, varname ) type varname; +#define TracyLockableN( type, varname, desc ) type varname; +#define TracySharedLockable( type, varname ) type varname; +#define TracySharedLockableN( type, varname, desc ) type varname; +#define LockableBase( type ) type +#define SharedLockableBase( type ) type +#define LockMark(x) (void)x; + +#define TracyPlot(x,y) +#define TracyPlotConfig(x,y) + +#define TracyMessage(x,y) +#define TracyMessageL(x) +#define TracyMessageC(x,y,z) +#define TracyMessageLC(x,y) +#define TracyAppInfo(x,y) + +#define TracyAlloc(x,y) +#define TracyFree(x) + +#define ZoneNamedS(x,y,z) +#define ZoneNamedNS(x,y,z,w) +#define ZoneNamedCS(x,y,z,w) +#define ZoneNamedNCS(x,y,z,w,a) + +#define ZoneScopedS(x) +#define ZoneScopedNS(x,y) +#define ZoneScopedCS(x,y) +#define ZoneScopedNCS(x,y,z) + +#define TracyAllocS(x,y,z) +#define TracyFreeS(x,y) + +#define TracyMessageS(x,y,z) +#define TracyMessageLS(x,y) +#define TracyMessageCS(x,y,z,w) +#define TracyMessageLCS(x,y,z) + +#define TracyParameterRegister(x) +#define TracyParameterSetup(x,y,z,w) + +#else + +#include "client/TracyLock.hpp" +#include "client/TracyProfiler.hpp" +#include "client/TracyScoped.hpp" + +#if defined TRACY_HAS_CALLSTACK && defined TRACY_CALLSTACK +# define ZoneNamed( varname, active ) static const tracy::SourceLocationData TracyConcat(__tracy_source_location,__LINE__) { nullptr, __FUNCTION__, __FILE__, (uint32_t)__LINE__, 0 }; tracy::ScopedZone varname( &TracyConcat(__tracy_source_location,__LINE__), TRACY_CALLSTACK, active ); +# define ZoneNamedN( varname, name, active ) static const tracy::SourceLocationData TracyConcat(__tracy_source_location,__LINE__) { name, __FUNCTION__, __FILE__, (uint32_t)__LINE__, 0 }; tracy::ScopedZone varname( &TracyConcat(__tracy_source_location,__LINE__), TRACY_CALLSTACK, active ); +# define ZoneNamedC( varname, color, active ) static const tracy::SourceLocationData TracyConcat(__tracy_source_location,__LINE__) { nullptr, __FUNCTION__, __FILE__, (uint32_t)__LINE__, color }; tracy::ScopedZone varname( &TracyConcat(__tracy_source_location,__LINE__), TRACY_CALLSTACK, active ); +# define ZoneNamedNC( varname, name, color, active ) static const tracy::SourceLocationData TracyConcat(__tracy_source_location,__LINE__) { name, __FUNCTION__, __FILE__, (uint32_t)__LINE__, color }; tracy::ScopedZone varname( &TracyConcat(__tracy_source_location,__LINE__), TRACY_CALLSTACK, active ); +#else +# define ZoneNamed( varname, active ) static const tracy::SourceLocationData TracyConcat(__tracy_source_location,__LINE__) { nullptr, __FUNCTION__, __FILE__, (uint32_t)__LINE__, 0 }; tracy::ScopedZone varname( &TracyConcat(__tracy_source_location,__LINE__), active ); +# define ZoneNamedN( varname, name, active ) static const tracy::SourceLocationData TracyConcat(__tracy_source_location,__LINE__) { name, __FUNCTION__, __FILE__, (uint32_t)__LINE__, 0 }; tracy::ScopedZone varname( 
&TracyConcat(__tracy_source_location,__LINE__), active ); +# define ZoneNamedC( varname, color, active ) static const tracy::SourceLocationData TracyConcat(__tracy_source_location,__LINE__) { nullptr, __FUNCTION__, __FILE__, (uint32_t)__LINE__, color }; tracy::ScopedZone varname( &TracyConcat(__tracy_source_location,__LINE__), active ); +# define ZoneNamedNC( varname, name, color, active ) static const tracy::SourceLocationData TracyConcat(__tracy_source_location,__LINE__) { name, __FUNCTION__, __FILE__, (uint32_t)__LINE__, color }; tracy::ScopedZone varname( &TracyConcat(__tracy_source_location,__LINE__), active ); +#endif + +#define ZoneScoped ZoneNamed( ___tracy_scoped_zone, true ) +#define ZoneScopedN( name ) ZoneNamedN( ___tracy_scoped_zone, name, true ) +#define ZoneScopedC( color ) ZoneNamedC( ___tracy_scoped_zone, color, true ) +#define ZoneScopedNC( name, color ) ZoneNamedNC( ___tracy_scoped_zone, name, color, true ) + +#define ZoneText( txt, size ) ___tracy_scoped_zone.Text( txt, size ); +#define ZoneName( txt, size ) ___tracy_scoped_zone.Name( txt, size ); + +#define FrameMark tracy::Profiler::SendFrameMark( nullptr ); +#define FrameMarkNamed( name ) tracy::Profiler::SendFrameMark( name ); +#define FrameMarkStart( name ) tracy::Profiler::SendFrameMark( name, tracy::QueueType::FrameMarkMsgStart ); +#define FrameMarkEnd( name ) tracy::Profiler::SendFrameMark( name, tracy::QueueType::FrameMarkMsgEnd ); + +#define FrameImage( image, width, height, offset, flip ) tracy::Profiler::SendFrameImage( image, width, height, offset, flip ); + +#define TracyLockable( type, varname ) tracy::Lockable<type> varname { [] () -> const tracy::SourceLocationData* { static const tracy::SourceLocationData srcloc { nullptr, #type " " #varname, __FILE__, __LINE__, 0 }; return &srcloc; }() }; +#define TracyLockableN( type, varname, desc ) tracy::Lockable<type> varname { [] () -> const tracy::SourceLocationData* { static const tracy::SourceLocationData srcloc { nullptr, desc, __FILE__, __LINE__, 0 }; return &srcloc; }() }; +#define TracySharedLockable( type, varname ) tracy::SharedLockable<type> varname { [] () -> const tracy::SourceLocationData* { static const tracy::SourceLocationData srcloc { nullptr, #type " " #varname, __FILE__, __LINE__, 0 }; return &srcloc; }() }; +#define TracySharedLockableN( type, varname, desc ) tracy::SharedLockable<type> varname { [] () -> const tracy::SourceLocationData* { static const tracy::SourceLocationData srcloc { nullptr, desc, __FILE__, __LINE__, 0 }; return &srcloc; }() }; +#define LockableBase( type ) tracy::Lockable<type> +#define SharedLockableBase( type ) tracy::SharedLockable<type> +#define LockMark( varname ) static const tracy::SourceLocationData __tracy_lock_location_##varname { nullptr, __FUNCTION__, __FILE__, (uint32_t)__LINE__, 0 }; varname.Mark( &__tracy_lock_location_##varname ); + +#define TracyPlot( name, val ) tracy::Profiler::PlotData( name, val ); +#define TracyPlotConfig( name, type ) tracy::Profiler::ConfigurePlot( name, type ); + +#define TracyAppInfo( txt, size ) tracy::Profiler::MessageAppInfo( txt, size ); + +#if defined TRACY_HAS_CALLSTACK && defined TRACY_CALLSTACK +# define TracyMessage( txt, size ) tracy::Profiler::Message( txt, size, TRACY_CALLSTACK ); +# define TracyMessageL( txt ) tracy::Profiler::Message( txt, TRACY_CALLSTACK ); +# define TracyMessageC( txt, size, color ) tracy::Profiler::MessageColor( txt, size, color, TRACY_CALLSTACK ); +# define TracyMessageLC( txt, color ) tracy::Profiler::MessageColor( txt, color, TRACY_CALLSTACK 
); + +# define TracyAlloc( ptr, size ) tracy::Profiler::MemAllocCallstack( ptr, size, TRACY_CALLSTACK ); +# define TracyFree( ptr ) tracy::Profiler::MemFreeCallstack( ptr, TRACY_CALLSTACK ); +#else +# define TracyMessage( txt, size ) tracy::Profiler::Message( txt, size, 0 ); +# define TracyMessageL( txt ) tracy::Profiler::Message( txt, 0 ); +# define TracyMessageC( txt, size, color ) tracy::Profiler::MessageColor( txt, size, color, 0 ); +# define TracyMessageLC( txt, color ) tracy::Profiler::MessageColor( txt, color, 0 ); + +# define TracyAlloc( ptr, size ) tracy::Profiler::MemAlloc( ptr, size ); +# define TracyFree( ptr ) tracy::Profiler::MemFree( ptr ); +#endif + +#ifdef TRACY_HAS_CALLSTACK +# define ZoneNamedS( varname, depth, active ) static const tracy::SourceLocationData TracyConcat(__tracy_source_location,__LINE__) { nullptr, __FUNCTION__, __FILE__, (uint32_t)__LINE__, 0 }; tracy::ScopedZone varname( &TracyConcat(__tracy_source_location,__LINE__), depth, active ); +# define ZoneNamedNS( varname, name, depth, active ) static const tracy::SourceLocationData TracyConcat(__tracy_source_location,__LINE__) { name, __FUNCTION__, __FILE__, (uint32_t)__LINE__, 0 }; tracy::ScopedZone varname( &TracyConcat(__tracy_source_location,__LINE__), depth, active ); +# define ZoneNamedCS( varname, color, depth, active ) static const tracy::SourceLocationData TracyConcat(__tracy_source_location,__LINE__) { nullptr, __FUNCTION__, __FILE__, (uint32_t)__LINE__, color }; tracy::ScopedZone varname( &TracyConcat(__tracy_source_location,__LINE__), depth, active ); +# define ZoneNamedNCS( varname, name, color, depth, active ) static const tracy::SourceLocationData TracyConcat(__tracy_source_location,__LINE__) { name, __FUNCTION__, __FILE__, (uint32_t)__LINE__, color }; tracy::ScopedZone varname( &TracyConcat(__tracy_source_location,__LINE__), depth, active ); + +# define ZoneScopedS( depth ) ZoneNamedS( ___tracy_scoped_zone, depth, true ) +# define ZoneScopedNS( name, depth ) ZoneNamedNS( ___tracy_scoped_zone, name, depth, true ) +# define ZoneScopedCS( color, depth ) ZoneNamedCS( ___tracy_scoped_zone, color, depth, true ) +# define ZoneScopedNCS( name, color, depth ) ZoneNamedNCS( ___tracy_scoped_zone, name, color depth, true ) + +# define TracyAllocS( ptr, size, depth ) tracy::Profiler::MemAllocCallstack( ptr, size, depth ); +# define TracyFreeS( ptr, depth ) tracy::Profiler::MemFreeCallstack( ptr, depth ); + +# define TracyMessageS( txt, size, depth ) tracy::Profiler::Message( txt, size, depth ); +# define TracyMessageLS( txt, depth ) tracy::Profiler::Message( txt, depth ); +# define TracyMessageCS( txt, size, color, depth ) tracy::Profiler::MessageColor( txt, size, color, depth ); +# define TracyMessageLCS( txt, color, depth ) tracy::Profiler::MessageColor( txt, color, depth ); +#else +# define ZoneNamedS( varname, depth, active ) ZoneNamed( varname, active ) +# define ZoneNamedNS( varname, name, depth, active ) ZoneNamedN( varname, name, active ) +# define ZoneNamedCS( varname, color, depth, active ) ZoneNamedC( varname, color, active ) +# define ZoneNamedNCS( varname, name, color, depth, active ) ZoneNamedNC( varname, name, color, active ) + +# define ZoneScopedS( depth ) ZoneScoped +# define ZoneScopedNS( name, depth ) ZoneScopedN( name ) +# define ZoneScopedCS( color, depth ) ZoneScopedC( color ) +# define ZoneScopedNCS( name, color, depth ) ZoneScopedNC( name, color ) + +# define TracyAllocS( ptr, size, depth ) TracyAlloc( ptr, size ) +# define TracyFreeS( ptr, depth ) TracyFree( ptr ) + +# define 
TracyMessageS( txt, size, depth ) TracyMessage( txt, size ) +# define TracyMessageLS( txt, depth ) TracyMessageL( txt ) +# define TracyMessageCS( txt, size, color, depth ) TracyMessageC( txt, size, color ) +# define TracyMessageLCS( txt, color, depth ) TracyMessageLC( txt, color ) +#endif + +#define TracyParameterRegister( cb ) tracy::Profiler::ParameterRegister( cb ); +#define TracyParameterSetup( idx, name, isBool, val ) tracy::Profiler::ParameterSetup( idx, name, isBool, val ); + +#endif + +#endif diff --git a/libs/tracy/TracyC.h b/libs/tracy/TracyC.h @@ -0,0 +1,188 @@ +#ifndef __TRACYC_HPP__ +#define __TRACYC_HPP__ + +#include <stddef.h> +#include <stdint.h> + +#include "client/TracyCallstack.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#ifndef TRACY_ENABLE + +typedef const void* TracyCZoneCtx; + +#define TracyCZone(c,x) +#define TracyCZoneN(c,x,y) +#define TracyCZoneC(c,x,y) +#define TracyCZoneNC(c,x,y,z) +#define TracyCZoneEnd(c) +#define TracyCZoneText(c,x,y) +#define TracyCZoneName(c,x,y) + +#define TracyCAlloc(x,y) +#define TracyCFree(x) + +#define TracyCFrameMark +#define TracyCFrameMarkNamed(x) +#define TracyCFrameMarkStart(x) +#define TracyCFrameMarkEnd(x) +#define TracyCFrameImage(x,y,z,w,a) + +#define TracyCPlot(x,y) +#define TracyCMessage(x,y) +#define TracyCMessageL(x) +#define TracyCMessageC(x,y,z) +#define TracyCMessageLC(x,y) +#define TracyCAppInfo(x,y) + +#define TracyCZoneS(x,y,z) +#define TracyCZoneNS(x,y,z,w) +#define TracyCZoneCS(x,y,z,w) +#define TracyCZoneNCS(x,y,z,w,a) + +#define TracyCAllocS(x,y,z) +#define TracyCFreeS(x,y) + +#define TracyCMessageS(x,y,z) +#define TracyCMessageLS(x,y) +#define TracyCMessageCS(x,y,z,w) +#define TracyCMessageLCS(x,y,z) + +#else + +#ifndef TracyConcat +# define TracyConcat(x,y) TracyConcatIndirect(x,y) +#endif +#ifndef TracyConcatIndirect +# define TracyConcatIndirect(x,y) x##y +#endif + +struct ___tracy_source_location_data +{ + const char* name; + const char* function; + const char* file; + uint32_t line; + uint32_t color; +}; + +struct ___tracy_c_zone_context +{ + uint32_t id; + int active; +}; + +// Some containers don't support storing const types. +// This struct, as visible to user, is immutable, so treat it as if const was declared here. 
+typedef /*const*/ struct ___tracy_c_zone_context TracyCZoneCtx; + +TRACY_API TracyCZoneCtx ___tracy_emit_zone_begin( const struct ___tracy_source_location_data* srcloc, int active ); +TRACY_API TracyCZoneCtx ___tracy_emit_zone_begin_callstack( const struct ___tracy_source_location_data* srcloc, int depth, int active ); +TRACY_API void ___tracy_emit_zone_end( TracyCZoneCtx ctx ); +TRACY_API void ___tracy_emit_zone_text( TracyCZoneCtx ctx, const char* txt, size_t size ); +TRACY_API void ___tracy_emit_zone_name( TracyCZoneCtx ctx, const char* txt, size_t size ); + +#if defined TRACY_HAS_CALLSTACK && defined TRACY_CALLSTACK +# define TracyCZone( ctx, active ) static const struct ___tracy_source_location_data TracyConcat(__tracy_source_location,__LINE__) = { NULL, __FUNCTION__, __FILE__, (uint32_t)__LINE__, 0 }; TracyCZoneCtx ctx = ___tracy_emit_zone_begin_callstack( &TracyConcat(__tracy_source_location,__LINE__), TRACY_CALLSTACK, active ); +# define TracyCZoneN( ctx, name, active ) static const struct ___tracy_source_location_data TracyConcat(__tracy_source_location,__LINE__) = { name, __FUNCTION__, __FILE__, (uint32_t)__LINE__, 0 }; TracyCZoneCtx ctx = ___tracy_emit_zone_begin_callstack( &TracyConcat(__tracy_source_location,__LINE__), TRACY_CALLSTACK, active ); +# define TracyCZoneC( ctx, color, active ) static const struct ___tracy_source_location_data TracyConcat(__tracy_source_location,__LINE__) = { NULL, __FUNCTION__, __FILE__, (uint32_t)__LINE__, color }; TracyCZoneCtx ctx = ___tracy_emit_zone_begin_callstack( &TracyConcat(__tracy_source_location,__LINE__), TRACY_CALLSTACK, active ); +# define TracyCZoneNC( ctx, name, color, active ) static const struct ___tracy_source_location_data TracyConcat(__tracy_source_location,__LINE__) = { name, __FUNCTION__, __FILE__, (uint32_t)__LINE__, color }; TracyCZoneCtx ctx = ___tracy_emit_zone_begin_callstack( &TracyConcat(__tracy_source_location,__LINE__), TRACY_CALLSTACK, active ); +#else +# define TracyCZone( ctx, active ) static const struct ___tracy_source_location_data TracyConcat(__tracy_source_location,__LINE__) = { NULL, __FUNCTION__, __FILE__, (uint32_t)__LINE__, 0 }; TracyCZoneCtx ctx = ___tracy_emit_zone_begin( &TracyConcat(__tracy_source_location,__LINE__), active ); +# define TracyCZoneN( ctx, name, active ) static const struct ___tracy_source_location_data TracyConcat(__tracy_source_location,__LINE__) = { name, __FUNCTION__, __FILE__, (uint32_t)__LINE__, 0 }; TracyCZoneCtx ctx = ___tracy_emit_zone_begin( &TracyConcat(__tracy_source_location,__LINE__), active ); +# define TracyCZoneC( ctx, color, active ) static const struct ___tracy_source_location_data TracyConcat(__tracy_source_location,__LINE__) = { NULL, __FUNCTION__, __FILE__, (uint32_t)__LINE__, color }; TracyCZoneCtx ctx = ___tracy_emit_zone_begin( &TracyConcat(__tracy_source_location,__LINE__), active ); +# define TracyCZoneNC( ctx, name, color, active ) static const struct ___tracy_source_location_data TracyConcat(__tracy_source_location,__LINE__) = { name, __FUNCTION__, __FILE__, (uint32_t)__LINE__, color }; TracyCZoneCtx ctx = ___tracy_emit_zone_begin( &TracyConcat(__tracy_source_location,__LINE__), active ); +#endif + +#define TracyCZoneEnd( ctx ) ___tracy_emit_zone_end( ctx ); + +#define TracyCZoneText( ctx, txt, size ) ___tracy_emit_zone_text( ctx, txt, size ); +#define TracyCZoneName( ctx, txt, size ) ___tracy_emit_zone_name( ctx, txt, size ); + + +TRACY_API void ___tracy_emit_memory_alloc( const void* ptr, size_t size ); +TRACY_API void 
___tracy_emit_memory_alloc_callstack( const void* ptr, size_t size, int depth ); +TRACY_API void ___tracy_emit_memory_free( const void* ptr ); +TRACY_API void ___tracy_emit_memory_free_callstack( const void* ptr, int depth ); + +TRACY_API void ___tracy_emit_message( const char* txt, size_t size, int callstack ); +TRACY_API void ___tracy_emit_messageL( const char* txt, int callstack ); +TRACY_API void ___tracy_emit_messageC( const char* txt, size_t size, uint32_t color, int callstack ); +TRACY_API void ___tracy_emit_messageLC( const char* txt, uint32_t color, int callstack ); + +#if defined TRACY_HAS_CALLSTACK && defined TRACY_CALLSTACK +# define TracyCAlloc( ptr, size ) ___tracy_emit_memory_alloc_callstack( ptr, size, TRACY_CALLSTACK ) +# define TracyCFree( ptr ) ___tracy_emit_memory_alloc_free_callstack( ptr, TRACY_CALLSTACK ) + +# define TracyCMessage( txt, size ) ___tracy_emit_message( txt, size, TRACY_CALLSTACK ); +# define TracyCMessageL( txt ) ___tracy_emit_messageL( txt, TRACY_CALLSTACK ); +# define TracyCMessageC( txt, size, color ) ___tracy_emit_messageC( txt, size, color, TRACY_CALLSTACK ); +# define TracyCMessageLC( txt, color ) ___tracy_emit_messageLC( txt, color, TRACY_CALLSTACK ); +#else +# define TracyCAlloc( ptr, size ) ___tracy_emit_memory_alloc( ptr, size ); +# define TracyCFree( ptr ) ___tracy_emit_memory_free( ptr ); + +# define TracyCMessage( txt, size ) ___tracy_emit_message( txt, size, 0 ); +# define TracyCMessageL( txt ) ___tracy_emit_messageL( txt, 0 ); +# define TracyCMessageC( txt, size, color ) ___tracy_emit_messageC( txt, size, color, 0 ); +# define TracyCMessageLC( txt, color ) ___tracy_emit_messageLC( txt, color, 0 ); +#endif + + +TRACY_API void ___tracy_emit_frame_mark( const char* name ); +TRACY_API void ___tracy_emit_frame_mark_start( const char* name ); +TRACY_API void ___tracy_emit_frame_mark_end( const char* name ); +TRACY_API void ___tracy_emit_frame_image( const void* image, uint16_t w, uint16_t h, uint8_t offset, int flip ); + +#define TracyCFrameMark ___tracy_emit_frame_mark( 0 ); +#define TracyCFrameMarkNamed( name ) ___tracy_emit_frame_mark( name ); +#define TracyCFrameMarkStart( name ) ___tracy_emit_frame_mark_start( name ); +#define TracyCFrameMarkEnd( name ) ___tracy_emit_frame_mark_end( name ); +#define TracyCFrameImage( image, width, height, offset, flip ) ___tracy_emit_frame_image( image, width, height, offset, flip ); + + +TRACY_API void ___tracy_emit_plot( const char* name, double val ); +TRACY_API void ___tracy_emit_message_appinfo( const char* txt, size_t size ); + +#define TracyCPlot( name, val ) ___tracy_emit_plot( name, val ); +#define TracyCAppInfo( txt, color ) ___tracy_emit_message_appinfo( txt, color ); + + +#ifdef TRACY_HAS_CALLSTACK +# define TracyCZoneS( ctx, depth, active ) static const struct ___tracy_source_location_data TracyConcat(__tracy_source_location,__LINE__) = { NULL, __FUNCTION__, __FILE__, (uint32_t)__LINE__, 0 }; TracyCZoneCtx ctx = ___tracy_emit_zone_begin_callstack( &TracyConcat(__tracy_source_location,__LINE__), depth, active ); +# define TracyCZoneNS( ctx, name, depth, active ) static const struct ___tracy_source_location_data TracyConcat(__tracy_source_location,__LINE__) = { name, __FUNCTION__, __FILE__, (uint32_t)__LINE__, 0 }; TracyCZoneCtx ctx = ___tracy_emit_zone_begin_callstack( &TracyConcat(__tracy_source_location,__LINE__), depth, active ); +# define TracyCZoneCS( ctx, color, depth, active ) static const struct ___tracy_source_location_data TracyConcat(__tracy_source_location,__LINE__) = { NULL, 
__FUNCTION__, __FILE__, (uint32_t)__LINE__, color }; TracyCZoneCtx ctx = ___tracy_emit_zone_begin_callstack( &TracyConcat(__tracy_source_location,__LINE__), depth, active ); +# define TracyCZoneNCS( ctx, name, color, depth, active ) static const struct ___tracy_source_location_data TracyConcat(__tracy_source_location,__LINE__) = { name, __FUNCTION__, __FILE__, (uint32_t)__LINE__, color }; TracyCZoneCtx ctx = ___tracy_emit_zone_begin_callstack( &TracyConcat(__tracy_source_location,__LINE__), depth, active ); + +# define TracyCAllocS( ptr, size, depth ) ___tracy_emit_memory_alloc_callstack( ptr, size, depth ) +# define TracyCFreeS( ptr, depth ) ___tracy_emit_memory_alloc_free_callstack( ptr, depth ) + +# define TracyCMessageS( txt, size, depth ) ___tracy_emit_message( txt, size, depth ); +# define TracyCMessageLS( txt, depth ) ___tracy_emit_messageL( txt, depth ); +# define TracyCMessageCS( txt, size, color, depth ) ___tracy_emit_messageC( txt, size, color, depth ); +# define TracyCMessageLCS( txt, color, depth ) ___tracy_emit_messageLC( txt, color, depth ); +#else +# define TracyCZoneS( ctx, depth, active ) TracyCZone( ctx, active ) +# define TracyCZoneNS( ctx, name, depth, active ) TracyCZoneN( ctx, name, active ) +# define TracyCZoneCS( ctx, color, depth, active ) TracyCZoneC( ctx, color, active ) +# define TracyCZoneNCS( ctx, name, color, depth, active ) TracyCZoneNC( ctx, name, color, active ) + +# define TracyCAllocS( ptr, size, depth ) TracyCAlloc( ptr, size ) +# define TracyCFreeS( ptr, depth ) TracyCFree( ptr ) + +# define TracyCMessageS( txt, size, depth ) TracyCMessage( txt, size ) +# define TracyCMessageLS( txt, depth ) TracyCMessageL( txt ) +# define TracyCMessageCS( txt, size, color, depth ) TracyCMessageC( txt, size, color ) +# define TracyCMessageLCS( txt, color, depth ) TracyCMessageLC( txt, color ) +#endif + +#endif + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/libs/tracy/TracyClient.cpp b/libs/tracy/TracyClient.cpp @@ -0,0 +1,43 @@ +// +// Tracy profiler +// ---------------- +// +// For fast integration, compile and +// link with this source file (and none +// other) in your executable (or in the +// main DLL / shared object on multi-DLL +// projects). +// + +// Define TRACY_ENABLE to enable profiler. + +#include "common/TracySystem.cpp" + +#ifdef TRACY_ENABLE + +#include "common/tracy_lz4.cpp" +#include "client/TracyProfiler.cpp" +#include "client/TracyCallstack.cpp" +#include "client/TracySysTime.cpp" +#include "client/TracySysTrace.cpp" +#include "common/TracySocket.cpp" +#include "client/tracy_rpmalloc.cpp" +#include "client/TracyDxt1.cpp" + +#if TRACY_HAS_CALLSTACK == 2 || TRACY_HAS_CALLSTACK == 3 || TRACY_HAS_CALLSTACK == 6 +# include "libbacktrace/alloc.cpp" +# include "libbacktrace/dwarf.cpp" +# include "libbacktrace/elf.cpp" +# include "libbacktrace/fileline.cpp" +# include "libbacktrace/mmapio.cpp" +# include "libbacktrace/posix.cpp" +# include "libbacktrace/sort.cpp" +# include "libbacktrace/state.cpp" +#endif + +#ifdef _MSC_VER +# pragma comment(lib, "ws2_32.lib") +# pragma comment(lib, "dbghelp.lib") +#endif + +#endif diff --git a/libs/tracy/TracyOpenGL.hpp b/libs/tracy/TracyOpenGL.hpp @@ -0,0 +1,272 @@ +#ifndef __TRACYOPENGL_HPP__ +#define __TRACYOPENGL_HPP__ + +// Include this file after you include OpenGL 3.2 headers. 
+ +#if !defined TRACY_ENABLE || defined __APPLE__ + +#define TracyGpuContext +#define TracyGpuNamedZone(x,y) +#define TracyGpuNamedZoneC(x,y,z) +#define TracyGpuZone(x) +#define TracyGpuZoneC(x,y) +#define TracyGpuCollect + +#define TracyGpuNamedZoneS(x,y,z) +#define TracyGpuNamedZoneCS(x,y,z,w) +#define TracyGpuZoneS(x,y) +#define TracyGpuZoneCS(x,y,z) + +namespace tracy +{ +struct SourceLocationData; +class GpuCtxScope +{ +public: + GpuCtxScope( const SourceLocationData* ) {} + GpuCtxScope( const SourceLocationData*, int depth ) {} +}; +} + +#else + +#include <atomic> +#include <assert.h> +#include <stdlib.h> + +#include "Tracy.hpp" +#include "client/TracyProfiler.hpp" +#include "client/TracyCallstack.hpp" +#include "common/TracyAlign.hpp" +#include "common/TracyAlloc.hpp" + +#if !defined GL_TIMESTAMP && defined GL_TIMESTAMP_EXT +# define GL_TIMESTAMP GL_TIMESTAMP_EXT +# define GL_QUERY_COUNTER_BITS GL_QUERY_COUNTER_BITS_EXT +# define glGetQueryObjectiv glGetQueryObjectivEXT +# define glGetQueryObjectui64v glGetQueryObjectui64vEXT +# define glQueryCounter glQueryCounterEXT +#endif + +#define TracyGpuContext tracy::GetGpuCtx().ptr = (tracy::GpuCtx*)tracy::tracy_malloc( sizeof( tracy::GpuCtx ) ); new(tracy::GetGpuCtx().ptr) tracy::GpuCtx; +#if defined TRACY_HAS_CALLSTACK && defined TRACY_CALLSTACK +# define TracyGpuNamedZone( varname, name ) static const tracy::SourceLocationData TracyConcat(__tracy_gpu_source_location,__LINE__) { name, __FUNCTION__, __FILE__, (uint32_t)__LINE__, 0 }; tracy::GpuCtxScope varname( &TracyConcat(__tracy_gpu_source_location,__LINE__), TRACY_CALLSTACK ); +# define TracyGpuNamedZoneC( varname, name, color ) static const tracy::SourceLocationData TracyConcat(__tracy_gpu_source_location,__LINE__) { name, __FUNCTION__, __FILE__, (uint32_t)__LINE__, color }; tracy::GpuCtxScope varname( &TracyConcat(__tracy_gpu_source_location,__LINE__), TRACY_CALLSTACK ); +# define TracyGpuZone( name ) TracyGpuNamedZoneS( ___tracy_gpu_zone, name, TRACY_CALLSTACK ) +# define TracyGpuZoneC( name, color ) TracyGpuNamedZoneCS( ___tracy_gpu_zone, name, color, TRACY_CALLSTACK ) +#else +# define TracyGpuNamedZone( varname, name ) static const tracy::SourceLocationData TracyConcat(__tracy_gpu_source_location,__LINE__) { name, __FUNCTION__, __FILE__, (uint32_t)__LINE__, 0 }; tracy::GpuCtxScope varname( &TracyConcat(__tracy_gpu_source_location,__LINE__) ); +# define TracyGpuNamedZoneC( varname, name, color ) static const tracy::SourceLocationData TracyConcat(__tracy_gpu_source_location,__LINE__) { name, __FUNCTION__, __FILE__, (uint32_t)__LINE__, color }; tracy::GpuCtxScope varname( &TracyConcat(__tracy_gpu_source_location,__LINE__) ); +# define TracyGpuZone( name ) TracyGpuNamedZone( ___tracy_gpu_zone, name ) +# define TracyGpuZoneC( name, color ) TracyGpuNamedZoneC( ___tracy_gpu_zone, name, color ) +#endif +#define TracyGpuCollect tracy::GetGpuCtx().ptr->Collect(); + +#ifdef TRACY_HAS_CALLSTACK +# define TracyGpuNamedZoneS( varname, name, depth ) static const tracy::SourceLocationData TracyConcat(__tracy_gpu_source_location,__LINE__) { name, __FUNCTION__, __FILE__, (uint32_t)__LINE__, 0 }; tracy::GpuCtxScope varname( &TracyConcat(__tracy_gpu_source_location,__LINE__), depth ); +# define TracyGpuNamedZoneCS( varname, name, color, depth ) static const tracy::SourceLocationData TracyConcat(__tracy_gpu_source_location,__LINE__) { name, __FUNCTION__, __FILE__, (uint32_t)__LINE__, color }; tracy::GpuCtxScope varname( &TracyConcat(__tracy_gpu_source_location,__LINE__), depth ); +# define 
TracyGpuZoneS( name, depth ) TracyGpuNamedZoneS( ___tracy_gpu_zone, name, depth ) +# define TracyGpuZoneCS( name, color, depth ) TracyGpuNamedZoneCS( ___tracy_gpu_zone, name, color, depth ) +#else +# define TracyGpuNamedZoneS( varname, name, depth ) TracyGpuNamedZone( varname, name ) +# define TracyGpuNamedZoneCS( varname, name, color, depth ) TracyGpuNamedZoneC( varname, name, color ) +# define TracyGpuZoneS( name, depth ) TracyGpuZone( name ) +# define TracyGpuZoneCS( name, color, depth ) TracyGpuZoneC( name, color ) +#endif + +namespace tracy +{ + +class GpuCtx +{ + friend class GpuCtxScope; + + enum { QueryCount = 64 * 1024 }; + +public: + GpuCtx() + : m_context( GetGpuCtxCounter().fetch_add( 1, std::memory_order_relaxed ) ) + , m_head( 0 ) + , m_tail( 0 ) + { + assert( m_context != 255 ); + + glGenQueries( QueryCount, m_query ); + + int64_t tgpu; + glGetInteger64v( GL_TIMESTAMP, &tgpu ); + int64_t tcpu = Profiler::GetTime(); + + GLint bits; + glGetQueryiv( GL_TIMESTAMP, GL_QUERY_COUNTER_BITS, &bits ); + + const float period = 1.f; + Magic magic; + const auto thread = GetThreadHandle(); + auto token = GetToken(); + auto& tail = token->get_tail_index(); + auto item = token->enqueue_begin( magic ); + MemWrite( &item->hdr.type, QueueType::GpuNewContext ); + MemWrite( &item->gpuNewContext.cpuTime, tcpu ); + MemWrite( &item->gpuNewContext.gpuTime, tgpu ); + MemWrite( &item->gpuNewContext.thread, thread ); + MemWrite( &item->gpuNewContext.period, period ); + MemWrite( &item->gpuNewContext.context, m_context ); + MemWrite( &item->gpuNewContext.accuracyBits, (uint8_t)bits ); + +#ifdef TRACY_ON_DEMAND + GetProfiler().DeferItem( *item ); +#endif + + tail.store( magic + 1, std::memory_order_release ); + } + + void Collect() + { + ZoneScopedC( Color::Red4 ); + + if( m_tail == m_head ) return; + +#ifdef TRACY_ON_DEMAND + if( !GetProfiler().IsConnected() ) + { + m_head = m_tail = 0; + return; + } +#endif + + Magic magic; + auto token = GetToken(); + auto& tail = token->get_tail_index(); + + while( m_tail != m_head ) + { + GLint available; + glGetQueryObjectiv( m_query[m_tail], GL_QUERY_RESULT_AVAILABLE, &available ); + if( !available ) return; + + uint64_t time; + glGetQueryObjectui64v( m_query[m_tail], GL_QUERY_RESULT, &time ); + + auto item = token->enqueue_begin( magic ); + MemWrite( &item->hdr.type, QueueType::GpuTime ); + MemWrite( &item->gpuTime.gpuTime, (int64_t)time ); + MemWrite( &item->gpuTime.queryId, (uint16_t)m_tail ); + MemWrite( &item->gpuTime.context, m_context ); + tail.store( magic + 1, std::memory_order_release ); + + m_tail = ( m_tail + 1 ) % QueryCount; + } + } + +private: + tracy_force_inline unsigned int NextQueryId() + { + const auto id = m_head; + m_head = ( m_head + 1 ) % QueryCount; + assert( m_head != m_tail ); + return id; + } + + tracy_force_inline unsigned int TranslateOpenGlQueryId( unsigned int id ) + { + return m_query[id]; + } + + tracy_force_inline uint8_t GetId() const + { + return m_context; + } + + unsigned int m_query[QueryCount]; + uint8_t m_context; + + unsigned int m_head; + unsigned int m_tail; +}; + +class GpuCtxScope +{ +public: + tracy_force_inline GpuCtxScope( const SourceLocationData* srcloc ) +#ifdef TRACY_ON_DEMAND + : m_active( GetProfiler().IsConnected() ) +#endif + { +#ifdef TRACY_ON_DEMAND + if( !m_active ) return; +#endif + const auto queryId = GetGpuCtx().ptr->NextQueryId(); + glQueryCounter( GetGpuCtx().ptr->TranslateOpenGlQueryId( queryId ), GL_TIMESTAMP ); + + Magic magic; + auto token = GetToken(); + auto& tail = token->get_tail_index(); + 
auto item = token->enqueue_begin( magic ); + MemWrite( &item->hdr.type, QueueType::GpuZoneBegin ); + MemWrite( &item->gpuZoneBegin.cpuTime, Profiler::GetTime() ); + MemWrite( &item->gpuZoneBegin.srcloc, (uint64_t)srcloc ); + memset( &item->gpuZoneBegin.thread, 0, sizeof( item->gpuZoneBegin.thread ) ); + MemWrite( &item->gpuZoneBegin.queryId, uint16_t( queryId ) ); + MemWrite( &item->gpuZoneBegin.context, GetGpuCtx().ptr->GetId() ); + tail.store( magic + 1, std::memory_order_release ); + } + + tracy_force_inline GpuCtxScope( const SourceLocationData* srcloc, int depth ) +#ifdef TRACY_ON_DEMAND + : m_active( GetProfiler().IsConnected() ) +#endif + { +#ifdef TRACY_ON_DEMAND + if( !m_active ) return; +#endif + const auto queryId = GetGpuCtx().ptr->NextQueryId(); + glQueryCounter( GetGpuCtx().ptr->TranslateOpenGlQueryId( queryId ), GL_TIMESTAMP ); + + Magic magic; + const auto thread = GetThreadHandle(); + auto token = GetToken(); + auto& tail = token->get_tail_index(); + auto item = token->enqueue_begin( magic ); + MemWrite( &item->hdr.type, QueueType::GpuZoneBeginCallstack ); + MemWrite( &item->gpuZoneBegin.cpuTime, Profiler::GetTime() ); + MemWrite( &item->gpuZoneBegin.srcloc, (uint64_t)srcloc ); + MemWrite( &item->gpuZoneBegin.thread, thread ); + MemWrite( &item->gpuZoneBegin.queryId, uint16_t( queryId ) ); + MemWrite( &item->gpuZoneBegin.context, GetGpuCtx().ptr->GetId() ); + tail.store( magic + 1, std::memory_order_release ); + + GetProfiler().SendCallstack( depth ); + } + + tracy_force_inline ~GpuCtxScope() + { +#ifdef TRACY_ON_DEMAND + if( !m_active ) return; +#endif + const auto queryId = GetGpuCtx().ptr->NextQueryId(); + glQueryCounter( GetGpuCtx().ptr->TranslateOpenGlQueryId( queryId ), GL_TIMESTAMP ); + + Magic magic; + auto token = GetToken(); + auto& tail = token->get_tail_index(); + auto item = token->enqueue_begin( magic ); + MemWrite( &item->hdr.type, QueueType::GpuZoneEnd ); + MemWrite( &item->gpuZoneEnd.cpuTime, Profiler::GetTime() ); + memset( &item->gpuZoneEnd.thread, 0, sizeof( item->gpuZoneEnd.thread ) ); + MemWrite( &item->gpuZoneEnd.queryId, uint16_t( queryId ) ); + MemWrite( &item->gpuZoneEnd.context, GetGpuCtx().ptr->GetId() ); + tail.store( magic + 1, std::memory_order_release ); + } + +private: +#ifdef TRACY_ON_DEMAND + const bool m_active; +#endif +}; + +} + +#endif + +#endif diff --git a/libs/tracy/client/TracyArmCpuTable.hpp b/libs/tracy/client/TracyArmCpuTable.hpp @@ -0,0 +1,319 @@ +namespace tracy +{ + +static const char* DecodeArmImplementer( uint32_t v ) +{ + static char buf[16]; + switch( v ) + { + case 0x41: return "ARM"; + case 0x42: return "Broadcom"; + case 0x43: return "Cavium"; + case 0x44: return "DEC"; + case 0x46: return "Fujitsu"; + case 0x48: return "HiSilicon"; + case 0x4d: return "Motorola"; + case 0x4e: return "Nvidia"; + case 0x50: return "Applied Micro"; + case 0x51: return "Qualcomm"; + case 0x53: return "Samsung"; + case 0x54: return "Texas Instruments"; + case 0x56: return "Marvell"; + case 0x61: return "Apple"; + case 0x66: return "Faraday"; + case 0x68: return "HXT"; + case 0x69: return "Intel"; + default: break; + } + sprintf( buf, "0x%x", v ); + return buf; +} + +static const char* DecodeArmPart( uint32_t impl, uint32_t part ) +{ + static char buf[16]; + switch( impl ) + { + case 0x41: + switch( part ) + { + case 0x810: return "810"; + case 0x920: return "920"; + case 0x922: return "922"; + case 0x926: return "926"; + case 0x940: return "940"; + case 0x946: return "946"; + case 0x966: return "966"; + case 0xa20: return "1020"; + case 
0xa22: return "1022"; + case 0xa26: return "1026"; + case 0xb02: return "11 MPCore"; + case 0xb36: return "1136"; + case 0xb56: return "1156"; + case 0xb76: return "1176"; + case 0xc05: return " Cortex-A5"; + case 0xc07: return " Cortex-A7"; + case 0xc08: return " Cortex-A8"; + case 0xc09: return " Cortex-A9"; + case 0xc0c: return " Cortex-A12"; + case 0xc0d: return " Rockchip RK3288"; + case 0xc0f: return " Cortex-A15"; + case 0xc0e: return " Cortex-A17"; + case 0xc14: return " Cortex-R4"; + case 0xc15: return " Cortex-R5"; + case 0xc17: return " Cortex-R7"; + case 0xc18: return " Cortex-R8"; + case 0xc20: return " Cortex-M0"; + case 0xc21: return " Cortex-M1"; + case 0xc23: return " Cortex-M3"; + case 0xc24: return " Cortex-M4"; + case 0xc27: return " Cortex-M7"; + case 0xc60: return " Cortex-M0+"; + case 0xd00: return " AArch64 simulator"; + case 0xd01: return " Cortex-A32"; + case 0xd03: return " Cortex-A53"; + case 0xd04: return " Cortex-A35"; + case 0xd05: return " Cortex-A55"; + case 0xd06: return " Cortex-A65"; + case 0xd07: return " Cortex-A57"; + case 0xd08: return " Cortex-A72"; + case 0xd09: return " Cortex-A73"; + case 0xd0a: return " Cortex-A75"; + case 0xd0b: return " Cortex-A76"; + case 0xd0c: return " Neoverse N1"; + case 0xd0d: return " Cortex-A77"; + case 0xd0e: return " Cortex-A76AE"; + case 0xd0f: return " AEMv8"; + case 0xd13: return " Cortex-R52"; + case 0xd20: return " Cortex-M23"; + case 0xd21: return " Cortex-M33"; + case 0xd4a: return " Neoverse E1"; + default: break; + } + case 0x42: + switch( part ) + { + case 0xf: return " Brahma B15"; + case 0x100: return " Brahma B53"; + case 0x516: return " ThunderX2"; + default: break; + } + case 0x43: + switch( part ) + { + case 0xa0: return " ThunderX"; + case 0xa1: return " ThunderX 88XX"; + case 0xa2: return " ThunderX 81XX"; + case 0xa3: return " ThunderX 83XX"; + case 0xaf: return " ThunderX2 99xx"; + default: break; + } + case 0x44: + switch( part ) + { + case 0xa10: return " SA110"; + case 0xa11: return " SA1100"; + default: break; + } + case 0x46: + switch( part ) + { + case 0x1: return " A64FX"; + default: break; + } + case 0x48: + switch( part ) + { + case 0xd01: return " TSV100"; + case 0xd40: return " Kirin 980"; + default: break; + } + case 0x4e: + switch( part ) + { + case 0x0: return " Denver"; + case 0x3: return " Denver 2"; + case 0x4: return " Carmel"; + default: break; + } + case 0x50: + switch( part ) + { + case 0x0: return " X-Gene"; + default: break; + } + case 0x51: + switch( part ) + { + case 0xf: return " Scorpion"; + case 0x2d: return " Scorpion"; + case 0x4d: return " Krait"; + case 0x6f: return " Krait"; + case 0x200: return " Kryo"; + case 0x201: return " Kryo Silver (Snapdragon 821)"; + case 0x205: return " Kryo Gold"; + case 0x211: return " Kryo Silver (Snapdragon 820)"; + case 0x800: return " Kryo 260 / 280 Gold"; + case 0x801: return " Kryo 260 / 280 Silver"; + case 0x802: return " Kryo 385 Gold"; + case 0x803: return " Kryo 385 Silver"; + case 0x804: return " Kryo 485 Gold"; + case 0xc00: return " Falkor"; + case 0xc01: return " Saphira"; + default: break; + } + case 0x53: + switch( part ) + { + case 0x1: return " Exynos M1/M2"; + case 0x2: return " Exynos M3"; + default: break; + } + case 0x56: + switch( part ) + { + case 0x131: return " Feroceon 88FR131"; + case 0x581: return " PJ4 / PJ4B"; + case 0x584: return " PJ4B-MP / PJ4C"; + default: break; + } + case 0x61: + switch( part ) + { + case 0x1: return " Cyclone"; + case 0x2: return " Typhoon"; + case 0x3: return " Typhoon/Capri"; + 
case 0x4: return " Twister"; + case 0x5: return " Twister/Elba/Malta"; + case 0x6: return " Hurricane"; + case 0x7: return " Hurricane/Myst"; + default: break; + } + case 0x66: + switch( part ) + { + case 0x526: return " FA526"; + case 0x626: return " FA626"; + default: break; + } + case 0x68: + switch( part ) + { + case 0x0: return " Phecda"; + default: break; + } + default: break; + } + sprintf( buf, " 0x%x", part ); + return buf; +} + +static const char* DecodeIosDevice( const char* id ) +{ + static const char* DeviceTable[] = { + "i386", "32-bit simulator", + "x86_64", "64-bit simulator", + "iPhone1,1", "iPhone", + "iPhone1,2", "iPhone 3G", + "iPhone2,1", "iPhone 3GS", + "iPhone3,1", "iPhone 4 (GSM)", + "iPhone3,2", "iPhone 4 (GSM)", + "iPhone3,3", "iPhone 4 (CDMA)", + "iPhone4,1", "iPhone 4S", + "iPhone5,1", "iPhone 5 (A1428)", + "iPhone5,2", "iPhone 5 (A1429)", + "iPhone5,3", "iPhone 5c (A1456/A1532)", + "iPhone5,4", "iPhone 5c (A1507/A1516/1526/A1529)", + "iPhone6,1", "iPhone 5s (A1433/A1533)", + "iPhone6,2", "iPhone 5s (A1457/A1518/A1528/A1530)", + "iPhone7,1", "iPhone 6 Plus", + "iPhone7,2", "iPhone 6", + "iPhone8,1", "iPhone 6S", + "iPhone8,2", "iPhone 6S Plus", + "iPhone8,4", "iPhone SE", + "iPhone9,1", "iPhone 7 (CDMA)", + "iPhone9,2", "iPhone 7 Plus (CDMA)", + "iPhone9,3", "iPhone 7 (GSM)", + "iPhone9,4", "iPhone 7 Plus (GSM)", + "iPhone10,1", "iPhone 8 (CDMA)", + "iPhone10,2", "iPhone 8 Plus (CDMA)", + "iPhone10,3", "iPhone X (CDMA)", + "iPhone10,4", "iPhone 8 (GSM)", + "iPhone10,5", "iPhone 8 Plus (GSM)", + "iPhone10,6", "iPhone X (GSM)", + "iPhone11,2", "iPhone XS", + "iPhone11,4", "iPhone XS Max", + "iPhone11,6", "iPhone XS Max China", + "iPhone11,8", "iPhone XR", + "iPad1,1", "iPad (A1219/A1337)", + "iPad2,1", "iPad 2 (A1395)", + "iPad2,2", "iPad 2 (A1396)", + "iPad2,3", "iPad 2 (A1397)", + "iPad2,4", "iPad 2 (A1395)", + "iPad2,5", "iPad Mini (A1432)", + "iPad2,6", "iPad Mini (A1454)", + "iPad2,7", "iPad Mini (A1455)", + "iPad3,1", "iPad 3 (A1416)", + "iPad3,2", "iPad 3 (A1403)", + "iPad3,3", "iPad 3 (A1430)", + "iPad3,4", "iPad 4 (A1458)", + "iPad3,5", "iPad 4 (A1459)", + "iPad3,6", "iPad 4 (A1460)", + "iPad4,1", "iPad Air (A1474)", + "iPad4,2", "iPad Air (A1475)", + "iPad4,3", "iPad Air (A1476)", + "iPad4,4", "iPad Mini 2 (A1489)", + "iPad4,5", "iPad Mini 2 (A1490)", + "iPad4,6", "iPad Mini 2 (A1491)", + "iPad4,7", "iPad Mini 3 (A1599)", + "iPad4,8", "iPad Mini 3 (A1600)", + "iPad4,9", "iPad Mini 3 (A1601)", + "iPad5,1", "iPad Mini 4 (A1538)", + "iPad5,2", "iPad Mini 4 (A1550)", + "iPad5,3", "iPad Air 2 (A1566)", + "iPad5,4", "iPad Air 2 (A1567)", + "iPad6,3", "iPad Pro 9.7\" (A1673)", + "iPad6,4", "iPad Pro 9.7\" (A1674)", + "iPad6,5", "iPad Pro 9.7\" (A1675)", + "iPad6,7", "iPad Pro 12.9\" (A1584)", + "iPad6,8", "iPad Pro 12.9\" (A1652)", + "iPad6,11", "iPad 5th gen (A1822)", + "iPad6,12", "iPad 5th gen (A1823)", + "iPad7,1", "iPad Pro 12.9\" 2nd gen (A1670)", + "iPad7,2", "iPad Pro 12.9\" 2nd gen (A1671/A1821)", + "iPad7,3", "iPad Pro 10.5\" (A1701)", + "iPad7,4", "iPad Pro 10.5\" (A1709)", + "iPad7,5", "iPad 6th gen (A1893)", + "iPad7,6", "iPad 6th gen (A1954)", + "iPad8,1", "iPad Pro 11\" (A1980)", + "iPad8,2", "iPad Pro 11\" (A1980)", + "iPad8,3", "iPad Pro 11\" (A1934/A1979/A2013)", + "iPad8,4", "iPad Pro 11\" (A1934/A1979/A2013)", + "iPad8,5", "iPad Pro 12.9\" 3rd gen (A1876)", + "iPad8,6", "iPad Pro 12.9\" 3rd gen (A1876)", + "iPad8,7", "iPad Pro 12.9\" 3rd gen (A1895/A1983/A2014)", + "iPad8,8", "iPad Pro 12.9\" 3rd gen (A1895/A1983/A2014)", + "iPad11,1", "iPad 
Mini 5th gen (A2133)", + "iPad11,2", "iPad Mini 5th gen (A2124/A2125/A2126)", + "iPad11,3", "iPad Air 3rd gen (A2152)", + "iPad11,4", "iPad Air 3rd gen (A2123/A2153/A2154)", + "iPod1,1", "iPod Touch", + "iPod2,1", "iPod Touch 2nd gen", + "iPod3,1", "iPod Touch 3rd gen", + "iPod4,1", "iPod Touch 4th gen", + "iPod5,1", "iPod Touch 5th gen", + "iPod7,1", "iPod Touch 6th gen", + "iPod9,1", "iPod Touch 7th gen", + nullptr + }; + + auto ptr = DeviceTable; + while( *ptr ) + { + if( strcmp( ptr[0], id ) == 0 ) return ptr[1]; + ptr += 2; + } + return id; +} + +} diff --git a/libs/tracy/client/TracyCallstack.cpp b/libs/tracy/client/TracyCallstack.cpp @@ -0,0 +1,603 @@ +#include <stdio.h> +#include <string.h> +#include "TracyCallstack.hpp" + +#ifdef TRACY_HAS_CALLSTACK + +#if TRACY_HAS_CALLSTACK == 1 +# ifndef NOMINMAX +# define NOMINMAX +# endif +# include <windows.h> +# ifdef _MSC_VER +# pragma warning( push ) +# pragma warning( disable : 4091 ) +# endif +# include <dbghelp.h> +# ifdef _MSC_VER +# pragma warning( pop ) +# endif +#elif TRACY_HAS_CALLSTACK == 2 || TRACY_HAS_CALLSTACK == 3 || TRACY_HAS_CALLSTACK == 6 +# include "../libbacktrace/backtrace.hpp" +# include <dlfcn.h> +# include <cxxabi.h> +#elif TRACY_HAS_CALLSTACK == 4 || TRACY_HAS_CALLSTACK == 5 +# include <dlfcn.h> +# include <cxxabi.h> +#endif + +namespace tracy +{ + +#if TRACY_HAS_CALLSTACK == 1 + +enum { MaxCbTrace = 16 }; + +int cb_num; +CallstackEntry cb_data[MaxCbTrace]; + +extern "C" { t_RtlWalkFrameChain RtlWalkFrameChain = 0; } + +#if defined __MINGW32__ && API_VERSION_NUMBER < 12 +extern "C" { +// Actual required API_VERSION_NUMBER is unknown because it is undocumented. These functions are not present in at least v11. +DWORD IMAGEAPI SymAddrIncludeInlineTrace(HANDLE hProcess, DWORD64 Address); +BOOL IMAGEAPI SymQueryInlineTrace(HANDLE hProcess, DWORD64 StartAddress, DWORD StartContext, DWORD64 StartRetAddress, + DWORD64 CurAddress, LPDWORD CurContext, LPDWORD CurFrameIndex); +BOOL IMAGEAPI SymFromInlineContext(HANDLE hProcess, DWORD64 Address, ULONG InlineContext, PDWORD64 Displacement, + PSYMBOL_INFO Symbol); +BOOL IMAGEAPI SymGetLineFromInlineContext(HANDLE hProcess, DWORD64 qwAddr, ULONG InlineContext, + DWORD64 qwModuleBaseAddress, PDWORD pdwDisplacement, PIMAGEHLP_LINE64 Line64); +}; +#endif + +void InitCallstack() +{ +#ifdef UNICODE + RtlWalkFrameChain = (t_RtlWalkFrameChain)GetProcAddress( GetModuleHandle( L"ntdll.dll" ), "RtlWalkFrameChain" ); +#else + RtlWalkFrameChain = (t_RtlWalkFrameChain)GetProcAddress( GetModuleHandle( "ntdll.dll" ), "RtlWalkFrameChain" ); +#endif + SymInitialize( GetCurrentProcess(), nullptr, true ); + SymSetOptions( SYMOPT_LOAD_LINES ); +} + +const char* DecodeCallstackPtrFast( uint64_t ptr ) +{ + static char ret[1024]; + const auto proc = GetCurrentProcess(); + + char buf[sizeof( SYMBOL_INFO ) + 1024]; + auto si = (SYMBOL_INFO*)buf; + si->SizeOfStruct = sizeof( SYMBOL_INFO ); + si->MaxNameLen = 1024; + + if( SymFromAddr( proc, ptr, nullptr, si ) == 0 ) + { + *ret = '\0'; + } + else + { + memcpy( ret, si->Name, si->NameLen ); + ret[si->NameLen] = '\0'; + } + return ret; +} + +CallstackEntryData DecodeCallstackPtr( uint64_t ptr ) +{ + int write; + const auto proc = GetCurrentProcess(); +#ifndef __CYGWIN__ + DWORD inlineNum = SymAddrIncludeInlineTrace( proc, ptr ); + if( inlineNum > MaxCbTrace - 1 ) inlineNum = MaxCbTrace - 1; + DWORD ctx = 0; + DWORD idx; + BOOL doInline = FALSE; + if( inlineNum != 0 ) doInline = SymQueryInlineTrace( proc, ptr, 0, ptr, ptr, &ctx, &idx ); + if( doInline ) + { + 
write = inlineNum; + cb_num = 1 + inlineNum; + } + else +#endif + { + write = 0; + cb_num = 1; + } + + char buf[sizeof( SYMBOL_INFO ) + 1024]; + auto si = (SYMBOL_INFO*)buf; + si->SizeOfStruct = sizeof( SYMBOL_INFO ); + si->MaxNameLen = 1024; + + if( SymFromAddr( proc, ptr, nullptr, si ) == 0 ) + { + memcpy( si->Name, "[unknown]", 10 ); + si->NameLen = 9; + } + + IMAGEHLP_LINE64 line; + DWORD displacement = 0; + line.SizeOfStruct = sizeof(IMAGEHLP_LINE64); + + { + auto name = (char*)tracy_malloc(si->NameLen + 1); + memcpy(name, si->Name, si->NameLen); + name[si->NameLen] = '\0'; + + cb_data[write].name = name; + + const char* filename; + if (SymGetLineFromAddr64(proc, ptr, &displacement, &line) == 0) + { + filename = "[unknown]"; + cb_data[write].line = 0; + } + else + { + filename = line.FileName; + cb_data[write].line = line.LineNumber; + } + + const auto fsz = strlen(filename); + auto file = (char*)tracy_malloc(fsz + 1); + memcpy(file, filename, fsz); + file[fsz] = '\0'; + + cb_data[write].file = file; + } + +#ifndef __CYGWIN__ + if( doInline ) + { + for( DWORD i=0; i<inlineNum; i++ ) + { + auto& cb = cb_data[i]; + + if( SymFromInlineContext( proc, ptr, ctx, nullptr, si ) == 0 ) + { + memcpy( si->Name, "[unknown]", 10 ); + si->NameLen = 9; + } + + auto name = (char*)tracy_malloc( si->NameLen + 1 ); + memcpy( name, si->Name, si->NameLen ); + name[si->NameLen] = '\0'; + cb.name = name; + + const char* filename; + if( SymGetLineFromInlineContext( proc, ptr, ctx, 0, &displacement, &line ) == 0 ) + { + filename = "[unknown]"; + cb.line = 0; + } + else + { + filename = line.FileName; + cb.line = line.LineNumber; + } + + const auto fsz = strlen( filename ); + auto file = (char*)tracy_malloc( fsz + 1 ); + memcpy( file, filename, fsz ); + file[fsz] = '\0'; + cb.file = file; + + ctx++; + } + } +#endif + + return { cb_data, uint8_t( cb_num ) }; +} + +#elif TRACY_HAS_CALLSTACK == 4 + +void InitCallstack() +{ +} + +const char* DecodeCallstackPtrFast( uint64_t ptr ) +{ + static char ret[1024]; + auto vptr = (void*)ptr; + char** sym = nullptr; + const char* symname = nullptr; + Dl_info dlinfo; + if( dladdr( vptr, &dlinfo ) && dlinfo.dli_sname ) + { + symname = dlinfo.dli_sname; + } + else + { + sym = backtrace_symbols( &vptr, 1 ); + if( sym ) + { + symname = *sym; + } + } + if( symname ) + { + strcpy( ret, symname ); + } + else + { + *ret = '\0'; + } + return ret; +} + +CallstackEntryData DecodeCallstackPtr( uint64_t ptr ) +{ + static CallstackEntry cb; + cb.line = 0; + + char* demangled = nullptr; + const char* symname = nullptr; + const char* symloc = nullptr; + auto vptr = (void*)ptr; + char** sym = nullptr; + ptrdiff_t symoff = 0; + + Dl_info dlinfo; + if( dladdr( vptr, &dlinfo ) ) + { + symloc = dlinfo.dli_fname; + symname = dlinfo.dli_sname; + symoff = (char*)ptr - (char*)dlinfo.dli_saddr; + + if( symname && symname[0] == '_' ) + { + size_t len = 0; + int status; + demangled = abi::__cxa_demangle( symname, nullptr, &len, &status ); + if( status == 0 ) + { + symname = demangled; + } + } + } + + if( !symname ) + { + sym = backtrace_symbols( &vptr, 1 ); + if( !sym ) + { + symname = "[unknown]"; + } + else + { + symname = *sym; + } + } + if( !symloc ) + { + symloc = "[unknown]"; + } + + if( symoff == 0 ) + { + const auto namelen = strlen( symname ); + auto name = (char*)tracy_malloc( namelen + 1 ); + memcpy( name, symname, namelen ); + name[namelen] = '\0'; + cb.name = name; + } + else + { + char buf[32]; + const auto offlen = sprintf( buf, " + %td", symoff ); + const auto namelen = strlen( symname 
); + auto name = (char*)tracy_malloc( namelen + offlen + 1 ); + memcpy( name, symname, namelen ); + memcpy( name + namelen, buf, offlen ); + name[namelen + offlen] = '\0'; + cb.name = name; + } + + char buf[32]; + const auto addrlen = sprintf( buf, " [%p]", (void*)ptr ); + const auto loclen = strlen( symloc ); + auto loc = (char*)tracy_malloc( loclen + addrlen + 1 ); + memcpy( loc, symloc, loclen ); + memcpy( loc + loclen, buf, addrlen ); + loc[loclen + addrlen] = '\0'; + cb.file = loc; + + if( sym ) free( sym ); + if( demangled ) free( demangled ); + + return { &cb, 1 }; +} + +#elif TRACY_HAS_CALLSTACK == 2 || TRACY_HAS_CALLSTACK == 3 || TRACY_HAS_CALLSTACK == 6 + +enum { MaxCbTrace = 16 }; + +struct backtrace_state* cb_bts; +int cb_num; +CallstackEntry cb_data[MaxCbTrace]; + +void InitCallstack() +{ + cb_bts = backtrace_create_state( nullptr, 0, nullptr, nullptr ); +} + +static inline char* CopyString( const char* src ) +{ + const auto sz = strlen( src ); + auto dst = (char*)tracy_malloc( sz + 1 ); + memcpy( dst, src, sz ); + dst[sz] = '\0'; + return dst; +} + +static int FastCallstackDataCb( void* data, uintptr_t pc, const char* fn, int lineno, const char* function ) +{ + if( function ) + { + strcpy( (char*)data, function ); + } + else + { + const char* symname = nullptr; + auto vptr = (void*)pc; + Dl_info dlinfo; + if( dladdr( vptr, &dlinfo ) ) + { + symname = dlinfo.dli_sname; + } + if( symname ) + { + strcpy( (char*)data, symname ); + } + else + { + *(char*)data = '\0'; + } + } + return 1; +} + +static void FastCallstackErrorCb( void* data, const char* /*msg*/, int /*errnum*/ ) +{ + *(char*)data = '\0'; +} + +const char* DecodeCallstackPtrFast( uint64_t ptr ) +{ + static char ret[1024]; + backtrace_pcinfo( cb_bts, ptr, FastCallstackDataCb, FastCallstackErrorCb, ret ); + return ret; +} + +static int CallstackDataCb( void* /*data*/, uintptr_t pc, const char* fn, int lineno, const char* function ) +{ + enum { DemangleBufLen = 64*1024 }; + char demangled[DemangleBufLen]; + + if( !fn && !function ) + { + const char* symname = nullptr; + const char* symloc = nullptr; + auto vptr = (void*)pc; + ptrdiff_t symoff = 0; + + Dl_info dlinfo; + if( dladdr( vptr, &dlinfo ) ) + { + symloc = dlinfo.dli_fname; + symname = dlinfo.dli_sname; + symoff = (char*)pc - (char*)dlinfo.dli_saddr; + + if( symname && symname[0] == '_' ) + { + size_t len = DemangleBufLen; + int status; + abi::__cxa_demangle( symname, demangled, &len, &status ); + if( status == 0 ) + { + symname = demangled; + } + } + } + + if( !symname ) symname = "[unknown]"; + if( !symloc ) symloc = "[unknown]"; + + if( symoff == 0 ) + { + cb_data[cb_num].name = CopyString( symname ); + } + else + { + char buf[32]; + const auto offlen = sprintf( buf, " + %td", symoff ); + const auto namelen = strlen( symname ); + auto name = (char*)tracy_malloc( namelen + offlen + 1 ); + memcpy( name, symname, namelen ); + memcpy( name + namelen, buf, offlen ); + name[namelen + offlen] = '\0'; + cb_data[cb_num].name = name; + } + + char buf[32]; + const auto addrlen = sprintf( buf, " [%p]", (void*)pc ); + const auto loclen = strlen( symloc ); + auto loc = (char*)tracy_malloc( loclen + addrlen + 1 ); + memcpy( loc, symloc, loclen ); + memcpy( loc + loclen, buf, addrlen ); + loc[loclen + addrlen] = '\0'; + cb_data[cb_num].file = loc; + + cb_data[cb_num].line = 0; + } + else + { + if( !fn ) fn = "[unknown]"; + if( !function ) + { + function = "[unknown]"; + } + else + { + if( function[0] == '_' ) + { + size_t len = DemangleBufLen; + int status; + 
abi::__cxa_demangle( function, demangled, &len, &status ); + if( status == 0 ) + { + function = demangled; + } + } + } + + cb_data[cb_num].name = CopyString( function ); + cb_data[cb_num].file = CopyString( fn ); + cb_data[cb_num].line = lineno; + } + + if( ++cb_num >= MaxCbTrace ) + { + return 1; + } + else + { + return 0; + } +} + +static void CallstackErrorCb( void* /*data*/, const char* /*msg*/, int /*errnum*/ ) +{ + for( int i=0; i<cb_num; i++ ) + { + tracy_free( (void*)cb_data[i].name ); + tracy_free( (void*)cb_data[i].file ); + } + + cb_data[0].name = CopyString( "[error]" ); + cb_data[0].file = CopyString( "[error]" ); + cb_data[0].line = 0; + + cb_num = 1; +} + +CallstackEntryData DecodeCallstackPtr( uint64_t ptr ) +{ + cb_num = 0; + backtrace_pcinfo( cb_bts, ptr, CallstackDataCb, CallstackErrorCb, nullptr ); + assert( cb_num > 0 ); + return { cb_data, uint8_t( cb_num ) }; +} + +#elif TRACY_HAS_CALLSTACK == 5 + +void InitCallstack() +{ +} + +const char* DecodeCallstackPtrFast( uint64_t ptr ) +{ + static char ret[1024]; + auto vptr = (void*)ptr; + char** sym = nullptr; + const char* symname = nullptr; + Dl_info dlinfo; + if( dladdr( vptr, &dlinfo ) && dlinfo.dli_sname ) + { + symname = dlinfo.dli_sname; + } + if( symname ) + { + strcpy( ret, symname ); + } + else + { + *ret = '\0'; + } + return ret; +} + +CallstackEntryData DecodeCallstackPtr( uint64_t ptr ) +{ + static CallstackEntry cb; + cb.line = 0; + + char* demangled = nullptr; + const char* symname = nullptr; + const char* symloc = nullptr; + auto vptr = (void*)ptr; + char** sym = nullptr; + ptrdiff_t symoff = 0; + + Dl_info dlinfo; + if( dladdr( vptr, &dlinfo ) ) + { + symloc = dlinfo.dli_fname; + symname = dlinfo.dli_sname; + symoff = (char*)ptr - (char*)dlinfo.dli_saddr; + + if( symname && symname[0] == '_' ) + { + size_t len = 0; + int status; + demangled = abi::__cxa_demangle( symname, nullptr, &len, &status ); + if( status == 0 ) + { + symname = demangled; + } + } + } + + if( !symname ) + { + symname = "[unknown]"; + } + if( !symloc ) + { + symloc = "[unknown]"; + } + + if( symoff == 0 ) + { + const auto namelen = strlen( symname ); + auto name = (char*)tracy_malloc( namelen + 1 ); + memcpy( name, symname, namelen ); + name[namelen] = '\0'; + cb.name = name; + } + else + { + char buf[32]; + const auto offlen = sprintf( buf, " + %td", symoff ); + const auto namelen = strlen( symname ); + auto name = (char*)tracy_malloc( namelen + offlen + 1 ); + memcpy( name, symname, namelen ); + memcpy( name + namelen, buf, offlen ); + name[namelen + offlen] = '\0'; + cb.name = name; + } + + char buf[32]; + const auto addrlen = sprintf( buf, " [%p]", (void*)ptr ); + const auto loclen = strlen( symloc ); + auto loc = (char*)tracy_malloc( loclen + addrlen + 1 ); + memcpy( loc, symloc, loclen ); + memcpy( loc + loclen, buf, addrlen ); + loc[loclen + addrlen] = '\0'; + cb.file = loc; + + if( sym ) free( sym ); + if( demangled ) free( demangled ); + + return { &cb, 1 }; +} + +#endif + +} + +#endif diff --git a/libs/tracy/client/TracyCallstack.h b/libs/tracy/client/TracyCallstack.h @@ -0,0 +1,29 @@ +#ifndef __TRACYCALLSTACK_H__ +#define __TRACYCALLSTACK_H__ + +#if !defined _WIN32 && !defined __CYGWIN__ +# include <sys/param.h> +#endif + +#if defined _WIN32 || defined __CYGWIN__ +# define TRACY_HAS_CALLSTACK 1 +#elif defined __ANDROID__ +# if !defined __arm__ || __ANDROID_API__ >= 21 +# define TRACY_HAS_CALLSTACK 2 +# else +# define TRACY_HAS_CALLSTACK 5 +# endif +#elif defined __linux +// XXX: diesel changes +// # if defined _GNU_SOURCE && 
defined __GLIBC__ +// # define TRACY_HAS_CALLSTACK 3 +// # else +// # define TRACY_HAS_CALLSTACK 2 +// # endif +#elif defined __APPLE__ +# define TRACY_HAS_CALLSTACK 4 +#elif defined BSD +# define TRACY_HAS_CALLSTACK 6 +#endif + +#endif diff --git a/libs/tracy/client/TracyCallstack.hpp b/libs/tracy/client/TracyCallstack.hpp @@ -0,0 +1,112 @@ +#ifndef __TRACYCALLSTACK_HPP__ +#define __TRACYCALLSTACK_HPP__ + +#include "TracyCallstack.h" + +#if TRACY_HAS_CALLSTACK == 1 +extern "C" +{ + typedef unsigned long (__stdcall *t_RtlWalkFrameChain)( void**, unsigned long, unsigned long ); + extern t_RtlWalkFrameChain RtlWalkFrameChain; +} +#elif TRACY_HAS_CALLSTACK == 2 || TRACY_HAS_CALLSTACK == 5 +# include <unwind.h> +#elif TRACY_HAS_CALLSTACK >= 3 +# include <execinfo.h> +#endif + + +#ifdef TRACY_HAS_CALLSTACK + +#include <assert.h> +#include <stdint.h> + +#include "../common/TracyAlloc.hpp" +#include "../common/TracyForceInline.hpp" + +namespace tracy +{ + +struct CallstackEntry +{ + const char* name; + const char* file; + uint32_t line; +}; + +struct CallstackEntryData +{ + const CallstackEntry* data; + uint8_t size; +}; + +const char* DecodeCallstackPtrFast( uint64_t ptr ); +CallstackEntryData DecodeCallstackPtr( uint64_t ptr ); +void InitCallstack(); + +#if TRACY_HAS_CALLSTACK == 1 + +static tracy_force_inline void* Callstack( int depth ) +{ + assert( depth >= 1 && depth < 63 ); + + auto trace = (uintptr_t*)tracy_malloc( ( 1 + depth ) * sizeof( uintptr_t ) ); + const auto num = RtlWalkFrameChain( (void**)( trace + 1 ), depth, 0 ); + *trace = num; + + return trace; +} + +#elif TRACY_HAS_CALLSTACK == 2 || TRACY_HAS_CALLSTACK == 5 + +struct BacktraceState +{ + void** current; + void** end; +}; + +static _Unwind_Reason_Code tracy_unwind_callback( struct _Unwind_Context* ctx, void* arg ) +{ + auto state = (BacktraceState*)arg; + uintptr_t pc = _Unwind_GetIP( ctx ); + if( pc ) + { + if( state->current == state->end ) return _URC_END_OF_STACK; + *state->current++ = (void*)pc; + } + return _URC_NO_REASON; +} + +static tracy_force_inline void* Callstack( int depth ) +{ + assert( depth >= 1 && depth < 63 ); + + auto trace = (uintptr_t*)tracy_malloc( ( 1 + depth ) * sizeof( uintptr_t ) ); + BacktraceState state = { (void**)(trace+1), (void**)(trace+1+depth) }; + _Unwind_Backtrace( tracy_unwind_callback, &state ); + + *trace = (uintptr_t*)state.current - trace + 1; + + return trace; +} + +#elif TRACY_HAS_CALLSTACK == 3 || TRACY_HAS_CALLSTACK == 4 || TRACY_HAS_CALLSTACK == 6 + +static tracy_force_inline void* Callstack( int depth ) +{ + assert( depth >= 1 ); + + auto trace = (uintptr_t*)tracy_malloc( ( 1 + depth ) * sizeof( uintptr_t ) ); + const auto num = backtrace( (void**)(trace+1), depth ); + *trace = num; + + return trace; +} + +#endif + +} + +#endif + +#endif diff --git a/libs/tracy/client/TracyDxt1.cpp b/libs/tracy/client/TracyDxt1.cpp @@ -0,0 +1,646 @@ +#include "TracyDxt1.hpp" +#include "../common/TracyForceInline.hpp" + +#include <assert.h> +#include <stdint.h> +#include <string.h> + +#ifdef __ARM_NEON +# include <arm_neon.h> +#endif + +#if defined __AVX__ && !defined __SSE4_1__ +# define __SSE4_1__ +#endif + +#if defined __SSE4_1__ || defined __AVX2__ +# ifdef _MSC_VER +# include <intrin.h> +# else +# include <x86intrin.h> +# ifdef __CYGWIN__ +# ifndef _mm256_cvtsi256_si32 +# define _mm256_cvtsi256_si32( v ) ( _mm_cvtsi128_si32( _mm256_castsi256_si128( v ) ) ) +# endif +# endif +# endif +#endif + +namespace tracy +{ + +static inline uint16_t to565( uint8_t r, uint8_t g, uint8_t b ) +{ + return ( 
( r & 0xF8 ) << 8 ) | ( ( g & 0xFC ) << 3 ) | ( b >> 3 ); +} + +static inline uint16_t to565( uint32_t c ) +{ + return + ( ( c & 0xF80000 ) >> 19 ) | + ( ( c & 0x00FC00 ) >> 5 ) | + ( ( c & 0x0000F8 ) << 8 ); +} + +static const uint16_t DivTable[255*3+1] = { + 0xffff, 0xffff, 0xffff, 0xffff, 0xcccc, 0xaaaa, 0x9249, 0x8000, 0x71c7, 0x6666, 0x5d17, 0x5555, 0x4ec4, 0x4924, 0x4444, 0x4000, + 0x3c3c, 0x38e3, 0x35e5, 0x3333, 0x30c3, 0x2e8b, 0x2c85, 0x2aaa, 0x28f5, 0x2762, 0x25ed, 0x2492, 0x234f, 0x2222, 0x2108, 0x2000, + 0x1f07, 0x1e1e, 0x1d41, 0x1c71, 0x1bac, 0x1af2, 0x1a41, 0x1999, 0x18f9, 0x1861, 0x17d0, 0x1745, 0x16c1, 0x1642, 0x15c9, 0x1555, + 0x14e5, 0x147a, 0x1414, 0x13b1, 0x1352, 0x12f6, 0x129e, 0x1249, 0x11f7, 0x11a7, 0x115b, 0x1111, 0x10c9, 0x1084, 0x1041, 0x1000, + 0x0fc0, 0x0f83, 0x0f48, 0x0f0f, 0x0ed7, 0x0ea0, 0x0e6c, 0x0e38, 0x0e07, 0x0dd6, 0x0da7, 0x0d79, 0x0d4c, 0x0d20, 0x0cf6, 0x0ccc, + 0x0ca4, 0x0c7c, 0x0c56, 0x0c30, 0x0c0c, 0x0be8, 0x0bc5, 0x0ba2, 0x0b81, 0x0b60, 0x0b40, 0x0b21, 0x0b02, 0x0ae4, 0x0ac7, 0x0aaa, + 0x0a8e, 0x0a72, 0x0a57, 0x0a3d, 0x0a23, 0x0a0a, 0x09f1, 0x09d8, 0x09c0, 0x09a9, 0x0991, 0x097b, 0x0964, 0x094f, 0x0939, 0x0924, + 0x090f, 0x08fb, 0x08e7, 0x08d3, 0x08c0, 0x08ad, 0x089a, 0x0888, 0x0876, 0x0864, 0x0853, 0x0842, 0x0831, 0x0820, 0x0810, 0x0800, + 0x07f0, 0x07e0, 0x07d1, 0x07c1, 0x07b3, 0x07a4, 0x0795, 0x0787, 0x0779, 0x076b, 0x075d, 0x0750, 0x0743, 0x0736, 0x0729, 0x071c, + 0x070f, 0x0703, 0x06f7, 0x06eb, 0x06df, 0x06d3, 0x06c8, 0x06bc, 0x06b1, 0x06a6, 0x069b, 0x0690, 0x0685, 0x067b, 0x0670, 0x0666, + 0x065c, 0x0652, 0x0648, 0x063e, 0x0634, 0x062b, 0x0621, 0x0618, 0x060f, 0x0606, 0x05fd, 0x05f4, 0x05eb, 0x05e2, 0x05d9, 0x05d1, + 0x05c9, 0x05c0, 0x05b8, 0x05b0, 0x05a8, 0x05a0, 0x0598, 0x0590, 0x0588, 0x0581, 0x0579, 0x0572, 0x056b, 0x0563, 0x055c, 0x0555, + 0x054e, 0x0547, 0x0540, 0x0539, 0x0532, 0x052b, 0x0525, 0x051e, 0x0518, 0x0511, 0x050b, 0x0505, 0x04fe, 0x04f8, 0x04f2, 0x04ec, + 0x04e6, 0x04e0, 0x04da, 0x04d4, 0x04ce, 0x04c8, 0x04c3, 0x04bd, 0x04b8, 0x04b2, 0x04ad, 0x04a7, 0x04a2, 0x049c, 0x0497, 0x0492, + 0x048d, 0x0487, 0x0482, 0x047d, 0x0478, 0x0473, 0x046e, 0x0469, 0x0465, 0x0460, 0x045b, 0x0456, 0x0452, 0x044d, 0x0448, 0x0444, + 0x043f, 0x043b, 0x0436, 0x0432, 0x042d, 0x0429, 0x0425, 0x0421, 0x041c, 0x0418, 0x0414, 0x0410, 0x040c, 0x0408, 0x0404, 0x0400, + 0x03fc, 0x03f8, 0x03f4, 0x03f0, 0x03ec, 0x03e8, 0x03e4, 0x03e0, 0x03dd, 0x03d9, 0x03d5, 0x03d2, 0x03ce, 0x03ca, 0x03c7, 0x03c3, + 0x03c0, 0x03bc, 0x03b9, 0x03b5, 0x03b2, 0x03ae, 0x03ab, 0x03a8, 0x03a4, 0x03a1, 0x039e, 0x039b, 0x0397, 0x0394, 0x0391, 0x038e, + 0x038b, 0x0387, 0x0384, 0x0381, 0x037e, 0x037b, 0x0378, 0x0375, 0x0372, 0x036f, 0x036c, 0x0369, 0x0366, 0x0364, 0x0361, 0x035e, + 0x035b, 0x0358, 0x0355, 0x0353, 0x0350, 0x034d, 0x034a, 0x0348, 0x0345, 0x0342, 0x0340, 0x033d, 0x033a, 0x0338, 0x0335, 0x0333, + 0x0330, 0x032e, 0x032b, 0x0329, 0x0326, 0x0324, 0x0321, 0x031f, 0x031c, 0x031a, 0x0317, 0x0315, 0x0313, 0x0310, 0x030e, 0x030c, + 0x0309, 0x0307, 0x0305, 0x0303, 0x0300, 0x02fe, 0x02fc, 0x02fa, 0x02f7, 0x02f5, 0x02f3, 0x02f1, 0x02ef, 0x02ec, 0x02ea, 0x02e8, + 0x02e6, 0x02e4, 0x02e2, 0x02e0, 0x02de, 0x02dc, 0x02da, 0x02d8, 0x02d6, 0x02d4, 0x02d2, 0x02d0, 0x02ce, 0x02cc, 0x02ca, 0x02c8, + 0x02c6, 0x02c4, 0x02c2, 0x02c0, 0x02be, 0x02bc, 0x02bb, 0x02b9, 0x02b7, 0x02b5, 0x02b3, 0x02b1, 0x02b0, 0x02ae, 0x02ac, 0x02aa, + 0x02a8, 0x02a7, 0x02a5, 0x02a3, 0x02a1, 0x02a0, 0x029e, 0x029c, 0x029b, 0x0299, 0x0297, 0x0295, 0x0294, 0x0292, 0x0291, 0x028f, + 0x028d, 0x028c, 0x028a, 0x0288, 0x0287, 
0x0285, 0x0284, 0x0282, 0x0280, 0x027f, 0x027d, 0x027c, 0x027a, 0x0279, 0x0277, 0x0276, + 0x0274, 0x0273, 0x0271, 0x0270, 0x026e, 0x026d, 0x026b, 0x026a, 0x0268, 0x0267, 0x0265, 0x0264, 0x0263, 0x0261, 0x0260, 0x025e, + 0x025d, 0x025c, 0x025a, 0x0259, 0x0257, 0x0256, 0x0255, 0x0253, 0x0252, 0x0251, 0x024f, 0x024e, 0x024d, 0x024b, 0x024a, 0x0249, + 0x0247, 0x0246, 0x0245, 0x0243, 0x0242, 0x0241, 0x0240, 0x023e, 0x023d, 0x023c, 0x023b, 0x0239, 0x0238, 0x0237, 0x0236, 0x0234, + 0x0233, 0x0232, 0x0231, 0x0230, 0x022e, 0x022d, 0x022c, 0x022b, 0x022a, 0x0229, 0x0227, 0x0226, 0x0225, 0x0224, 0x0223, 0x0222, + 0x0220, 0x021f, 0x021e, 0x021d, 0x021c, 0x021b, 0x021a, 0x0219, 0x0218, 0x0216, 0x0215, 0x0214, 0x0213, 0x0212, 0x0211, 0x0210, + 0x020f, 0x020e, 0x020d, 0x020c, 0x020b, 0x020a, 0x0209, 0x0208, 0x0207, 0x0206, 0x0205, 0x0204, 0x0203, 0x0202, 0x0201, 0x0200, + 0x01ff, 0x01fe, 0x01fd, 0x01fc, 0x01fb, 0x01fa, 0x01f9, 0x01f8, 0x01f7, 0x01f6, 0x01f5, 0x01f4, 0x01f3, 0x01f2, 0x01f1, 0x01f0, + 0x01ef, 0x01ee, 0x01ed, 0x01ec, 0x01eb, 0x01ea, 0x01e9, 0x01e9, 0x01e8, 0x01e7, 0x01e6, 0x01e5, 0x01e4, 0x01e3, 0x01e2, 0x01e1, + 0x01e0, 0x01e0, 0x01df, 0x01de, 0x01dd, 0x01dc, 0x01db, 0x01da, 0x01da, 0x01d9, 0x01d8, 0x01d7, 0x01d6, 0x01d5, 0x01d4, 0x01d4, + 0x01d3, 0x01d2, 0x01d1, 0x01d0, 0x01cf, 0x01cf, 0x01ce, 0x01cd, 0x01cc, 0x01cb, 0x01cb, 0x01ca, 0x01c9, 0x01c8, 0x01c7, 0x01c7, + 0x01c6, 0x01c5, 0x01c4, 0x01c3, 0x01c3, 0x01c2, 0x01c1, 0x01c0, 0x01c0, 0x01bf, 0x01be, 0x01bd, 0x01bd, 0x01bc, 0x01bb, 0x01ba, + 0x01ba, 0x01b9, 0x01b8, 0x01b7, 0x01b7, 0x01b6, 0x01b5, 0x01b4, 0x01b4, 0x01b3, 0x01b2, 0x01b2, 0x01b1, 0x01b0, 0x01af, 0x01af, + 0x01ae, 0x01ad, 0x01ad, 0x01ac, 0x01ab, 0x01aa, 0x01aa, 0x01a9, 0x01a8, 0x01a8, 0x01a7, 0x01a6, 0x01a6, 0x01a5, 0x01a4, 0x01a4, + 0x01a3, 0x01a2, 0x01a2, 0x01a1, 0x01a0, 0x01a0, 0x019f, 0x019e, 0x019e, 0x019d, 0x019c, 0x019c, 0x019b, 0x019a, 0x019a, 0x0199, + 0x0198, 0x0198, 0x0197, 0x0197, 0x0196, 0x0195, 0x0195, 0x0194, 0x0193, 0x0193, 0x0192, 0x0192, 0x0191, 0x0190, 0x0190, 0x018f, + 0x018f, 0x018e, 0x018d, 0x018d, 0x018c, 0x018b, 0x018b, 0x018a, 0x018a, 0x0189, 0x0189, 0x0188, 0x0187, 0x0187, 0x0186, 0x0186, + 0x0185, 0x0184, 0x0184, 0x0183, 0x0183, 0x0182, 0x0182, 0x0181, 0x0180, 0x0180, 0x017f, 0x017f, 0x017e, 0x017e, 0x017d, 0x017d, + 0x017c, 0x017b, 0x017b, 0x017a, 0x017a, 0x0179, 0x0179, 0x0178, 0x0178, 0x0177, 0x0177, 0x0176, 0x0175, 0x0175, 0x0174, 0x0174, + 0x0173, 0x0173, 0x0172, 0x0172, 0x0171, 0x0171, 0x0170, 0x0170, 0x016f, 0x016f, 0x016e, 0x016e, 0x016d, 0x016d, 0x016c, 0x016c, + 0x016b, 0x016b, 0x016a, 0x016a, 0x0169, 0x0169, 0x0168, 0x0168, 0x0167, 0x0167, 0x0166, 0x0166, 0x0165, 0x0165, 0x0164, 0x0164, + 0x0163, 0x0163, 0x0162, 0x0162, 0x0161, 0x0161, 0x0160, 0x0160, 0x015f, 0x015f, 0x015e, 0x015e, 0x015d, 0x015d, 0x015d, 0x015c, + 0x015c, 0x015b, 0x015b, 0x015a, 0x015a, 0x0159, 0x0159, 0x0158, 0x0158, 0x0158, 0x0157, 0x0157, 0x0156, 0x0156 +}; +static const uint16_t DivTableAVX[255*3+1] = { + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x38e3, 0x35e5, 0x3333, 0x30c3, 0x2e8b, 0x2c85, 0x2aaa, 0x28f5, 0x2762, 0x25ed, 0x2492, 0x234f, 0x2222, 0x2108, 0x2000, + 0x1f07, 0x1e1e, 0x1d41, 0x1c71, 0x1bac, 0x1af2, 0x1a41, 0x1999, 0x18f9, 0x1861, 0x17d0, 0x1745, 0x16c1, 0x1642, 0x15c9, 0x1555, + 0x14e5, 0x147a, 0x1414, 0x13b1, 0x1352, 0x12f6, 0x129e, 0x1249, 0x11f7, 0x11a7, 0x115b, 0x1111, 0x10c9, 0x1084, 0x1041, 0x1000, + 0x0fc0, 0x0f83, 0x0f48, 0x0f0f, 0x0ed7, 0x0ea0, 
0x0e6c, 0x0e38, 0x0e07, 0x0dd6, 0x0da7, 0x0d79, 0x0d4c, 0x0d20, 0x0cf6, 0x0ccc, + 0x0ca4, 0x0c7c, 0x0c56, 0x0c30, 0x0c0c, 0x0be8, 0x0bc5, 0x0ba2, 0x0b81, 0x0b60, 0x0b40, 0x0b21, 0x0b02, 0x0ae4, 0x0ac7, 0x0aaa, + 0x0a8e, 0x0a72, 0x0a57, 0x0a3d, 0x0a23, 0x0a0a, 0x09f1, 0x09d8, 0x09c0, 0x09a9, 0x0991, 0x097b, 0x0964, 0x094f, 0x0939, 0x0924, + 0x090f, 0x08fb, 0x08e7, 0x08d3, 0x08c0, 0x08ad, 0x089a, 0x0888, 0x0876, 0x0864, 0x0853, 0x0842, 0x0831, 0x0820, 0x0810, 0x0800, + 0x07f0, 0x07e0, 0x07d1, 0x07c1, 0x07b3, 0x07a4, 0x0795, 0x0787, 0x0779, 0x076b, 0x075d, 0x0750, 0x0743, 0x0736, 0x0729, 0x071c, + 0x070f, 0x0703, 0x06f7, 0x06eb, 0x06df, 0x06d3, 0x06c8, 0x06bc, 0x06b1, 0x06a6, 0x069b, 0x0690, 0x0685, 0x067b, 0x0670, 0x0666, + 0x065c, 0x0652, 0x0648, 0x063e, 0x0634, 0x062b, 0x0621, 0x0618, 0x060f, 0x0606, 0x05fd, 0x05f4, 0x05eb, 0x05e2, 0x05d9, 0x05d1, + 0x05c9, 0x05c0, 0x05b8, 0x05b0, 0x05a8, 0x05a0, 0x0598, 0x0590, 0x0588, 0x0581, 0x0579, 0x0572, 0x056b, 0x0563, 0x055c, 0x0555, + 0x054e, 0x0547, 0x0540, 0x0539, 0x0532, 0x052b, 0x0525, 0x051e, 0x0518, 0x0511, 0x050b, 0x0505, 0x04fe, 0x04f8, 0x04f2, 0x04ec, + 0x04e6, 0x04e0, 0x04da, 0x04d4, 0x04ce, 0x04c8, 0x04c3, 0x04bd, 0x04b8, 0x04b2, 0x04ad, 0x04a7, 0x04a2, 0x049c, 0x0497, 0x0492, + 0x048d, 0x0487, 0x0482, 0x047d, 0x0478, 0x0473, 0x046e, 0x0469, 0x0465, 0x0460, 0x045b, 0x0456, 0x0452, 0x044d, 0x0448, 0x0444, + 0x043f, 0x043b, 0x0436, 0x0432, 0x042d, 0x0429, 0x0425, 0x0421, 0x041c, 0x0418, 0x0414, 0x0410, 0x040c, 0x0408, 0x0404, 0x0400, + 0x03fc, 0x03f8, 0x03f4, 0x03f0, 0x03ec, 0x03e8, 0x03e4, 0x03e0, 0x03dd, 0x03d9, 0x03d5, 0x03d2, 0x03ce, 0x03ca, 0x03c7, 0x03c3, + 0x03c0, 0x03bc, 0x03b9, 0x03b5, 0x03b2, 0x03ae, 0x03ab, 0x03a8, 0x03a4, 0x03a1, 0x039e, 0x039b, 0x0397, 0x0394, 0x0391, 0x038e, + 0x038b, 0x0387, 0x0384, 0x0381, 0x037e, 0x037b, 0x0378, 0x0375, 0x0372, 0x036f, 0x036c, 0x0369, 0x0366, 0x0364, 0x0361, 0x035e, + 0x035b, 0x0358, 0x0355, 0x0353, 0x0350, 0x034d, 0x034a, 0x0348, 0x0345, 0x0342, 0x0340, 0x033d, 0x033a, 0x0338, 0x0335, 0x0333, + 0x0330, 0x032e, 0x032b, 0x0329, 0x0326, 0x0324, 0x0321, 0x031f, 0x031c, 0x031a, 0x0317, 0x0315, 0x0313, 0x0310, 0x030e, 0x030c, + 0x0309, 0x0307, 0x0305, 0x0303, 0x0300, 0x02fe, 0x02fc, 0x02fa, 0x02f7, 0x02f5, 0x02f3, 0x02f1, 0x02ef, 0x02ec, 0x02ea, 0x02e8, + 0x02e6, 0x02e4, 0x02e2, 0x02e0, 0x02de, 0x02dc, 0x02da, 0x02d8, 0x02d6, 0x02d4, 0x02d2, 0x02d0, 0x02ce, 0x02cc, 0x02ca, 0x02c8, + 0x02c6, 0x02c4, 0x02c2, 0x02c0, 0x02be, 0x02bc, 0x02bb, 0x02b9, 0x02b7, 0x02b5, 0x02b3, 0x02b1, 0x02b0, 0x02ae, 0x02ac, 0x02aa, + 0x02a8, 0x02a7, 0x02a5, 0x02a3, 0x02a1, 0x02a0, 0x029e, 0x029c, 0x029b, 0x0299, 0x0297, 0x0295, 0x0294, 0x0292, 0x0291, 0x028f, + 0x028d, 0x028c, 0x028a, 0x0288, 0x0287, 0x0285, 0x0284, 0x0282, 0x0280, 0x027f, 0x027d, 0x027c, 0x027a, 0x0279, 0x0277, 0x0276, + 0x0274, 0x0273, 0x0271, 0x0270, 0x026e, 0x026d, 0x026b, 0x026a, 0x0268, 0x0267, 0x0265, 0x0264, 0x0263, 0x0261, 0x0260, 0x025e, + 0x025d, 0x025c, 0x025a, 0x0259, 0x0257, 0x0256, 0x0255, 0x0253, 0x0252, 0x0251, 0x024f, 0x024e, 0x024d, 0x024b, 0x024a, 0x0249, + 0x0247, 0x0246, 0x0245, 0x0243, 0x0242, 0x0241, 0x0240, 0x023e, 0x023d, 0x023c, 0x023b, 0x0239, 0x0238, 0x0237, 0x0236, 0x0234, + 0x0233, 0x0232, 0x0231, 0x0230, 0x022e, 0x022d, 0x022c, 0x022b, 0x022a, 0x0229, 0x0227, 0x0226, 0x0225, 0x0224, 0x0223, 0x0222, + 0x0220, 0x021f, 0x021e, 0x021d, 0x021c, 0x021b, 0x021a, 0x0219, 0x0218, 0x0216, 0x0215, 0x0214, 0x0213, 0x0212, 0x0211, 0x0210, + 0x020f, 0x020e, 0x020d, 0x020c, 0x020b, 0x020a, 0x0209, 0x0208, 0x0207, 0x0206, 0x0205, 
0x0204, 0x0203, 0x0202, 0x0201, 0x0200, + 0x01ff, 0x01fe, 0x01fd, 0x01fc, 0x01fb, 0x01fa, 0x01f9, 0x01f8, 0x01f7, 0x01f6, 0x01f5, 0x01f4, 0x01f3, 0x01f2, 0x01f1, 0x01f0, + 0x01ef, 0x01ee, 0x01ed, 0x01ec, 0x01eb, 0x01ea, 0x01e9, 0x01e9, 0x01e8, 0x01e7, 0x01e6, 0x01e5, 0x01e4, 0x01e3, 0x01e2, 0x01e1, + 0x01e0, 0x01e0, 0x01df, 0x01de, 0x01dd, 0x01dc, 0x01db, 0x01da, 0x01da, 0x01d9, 0x01d8, 0x01d7, 0x01d6, 0x01d5, 0x01d4, 0x01d4, + 0x01d3, 0x01d2, 0x01d1, 0x01d0, 0x01cf, 0x01cf, 0x01ce, 0x01cd, 0x01cc, 0x01cb, 0x01cb, 0x01ca, 0x01c9, 0x01c8, 0x01c7, 0x01c7, + 0x01c6, 0x01c5, 0x01c4, 0x01c3, 0x01c3, 0x01c2, 0x01c1, 0x01c0, 0x01c0, 0x01bf, 0x01be, 0x01bd, 0x01bd, 0x01bc, 0x01bb, 0x01ba, + 0x01ba, 0x01b9, 0x01b8, 0x01b7, 0x01b7, 0x01b6, 0x01b5, 0x01b4, 0x01b4, 0x01b3, 0x01b2, 0x01b2, 0x01b1, 0x01b0, 0x01af, 0x01af, + 0x01ae, 0x01ad, 0x01ad, 0x01ac, 0x01ab, 0x01aa, 0x01aa, 0x01a9, 0x01a8, 0x01a8, 0x01a7, 0x01a6, 0x01a6, 0x01a5, 0x01a4, 0x01a4, + 0x01a3, 0x01a2, 0x01a2, 0x01a1, 0x01a0, 0x01a0, 0x019f, 0x019e, 0x019e, 0x019d, 0x019c, 0x019c, 0x019b, 0x019a, 0x019a, 0x0199, + 0x0198, 0x0198, 0x0197, 0x0197, 0x0196, 0x0195, 0x0195, 0x0194, 0x0193, 0x0193, 0x0192, 0x0192, 0x0191, 0x0190, 0x0190, 0x018f, + 0x018f, 0x018e, 0x018d, 0x018d, 0x018c, 0x018b, 0x018b, 0x018a, 0x018a, 0x0189, 0x0189, 0x0188, 0x0187, 0x0187, 0x0186, 0x0186, + 0x0185, 0x0184, 0x0184, 0x0183, 0x0183, 0x0182, 0x0182, 0x0181, 0x0180, 0x0180, 0x017f, 0x017f, 0x017e, 0x017e, 0x017d, 0x017d, + 0x017c, 0x017b, 0x017b, 0x017a, 0x017a, 0x0179, 0x0179, 0x0178, 0x0178, 0x0177, 0x0177, 0x0176, 0x0175, 0x0175, 0x0174, 0x0174, + 0x0173, 0x0173, 0x0172, 0x0172, 0x0171, 0x0171, 0x0170, 0x0170, 0x016f, 0x016f, 0x016e, 0x016e, 0x016d, 0x016d, 0x016c, 0x016c, + 0x016b, 0x016b, 0x016a, 0x016a, 0x0169, 0x0169, 0x0168, 0x0168, 0x0167, 0x0167, 0x0166, 0x0166, 0x0165, 0x0165, 0x0164, 0x0164, + 0x0163, 0x0163, 0x0162, 0x0162, 0x0161, 0x0161, 0x0160, 0x0160, 0x015f, 0x015f, 0x015e, 0x015e, 0x015d, 0x015d, 0x015d, 0x015c, + 0x015c, 0x015b, 0x015b, 0x015a, 0x015a, 0x0159, 0x0159, 0x0158, 0x0158, 0x0158, 0x0157, 0x0157, 0x0156, 0x0156 +}; +static const uint16_t DivTableNEON[255*3+1] = { + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x1c71, 0x1af2, 0x1999, 0x1861, 0x1745, 0x1642, 0x1555, 0x147a, 0x13b1, 0x12f6, 0x1249, 0x11a7, 0x1111, 0x1084, 0x1000, + 0x0f83, 0x0f0f, 0x0ea0, 0x0e38, 0x0dd6, 0x0d79, 0x0d20, 0x0ccc, 0x0c7c, 0x0c30, 0x0be8, 0x0ba2, 0x0b60, 0x0b21, 0x0ae4, 0x0aaa, + 0x0a72, 0x0a3d, 0x0a0a, 0x09d8, 0x09a9, 0x097b, 0x094f, 0x0924, 0x08fb, 0x08d3, 0x08ad, 0x0888, 0x0864, 0x0842, 0x0820, 0x0800, + 0x07e0, 0x07c1, 0x07a4, 0x0787, 0x076b, 0x0750, 0x0736, 0x071c, 0x0703, 0x06eb, 0x06d3, 0x06bc, 0x06a6, 0x0690, 0x067b, 0x0666, + 0x0652, 0x063e, 0x062b, 0x0618, 0x0606, 0x05f4, 0x05e2, 0x05d1, 0x05c0, 0x05b0, 0x05a0, 0x0590, 0x0581, 0x0572, 0x0563, 0x0555, + 0x0547, 0x0539, 0x052b, 0x051e, 0x0511, 0x0505, 0x04f8, 0x04ec, 0x04e0, 0x04d4, 0x04c8, 0x04bd, 0x04b2, 0x04a7, 0x049c, 0x0492, + 0x0487, 0x047d, 0x0473, 0x0469, 0x0460, 0x0456, 0x044d, 0x0444, 0x043b, 0x0432, 0x0429, 0x0421, 0x0418, 0x0410, 0x0408, 0x0400, + 0x03f8, 0x03f0, 0x03e8, 0x03e0, 0x03d9, 0x03d2, 0x03ca, 0x03c3, 0x03bc, 0x03b5, 0x03ae, 0x03a8, 0x03a1, 0x039b, 0x0394, 0x038e, + 0x0387, 0x0381, 0x037b, 0x0375, 0x036f, 0x0369, 0x0364, 0x035e, 0x0358, 0x0353, 0x034d, 0x0348, 0x0342, 0x033d, 0x0338, 0x0333, + 0x032e, 0x0329, 0x0324, 0x031f, 0x031a, 0x0315, 0x0310, 0x030c, 0x0307, 0x0303, 0x02fe, 0x02fa, 
0x02f5, 0x02f1, 0x02ec, 0x02e8, + 0x02e4, 0x02e0, 0x02dc, 0x02d8, 0x02d4, 0x02d0, 0x02cc, 0x02c8, 0x02c4, 0x02c0, 0x02bc, 0x02b9, 0x02b5, 0x02b1, 0x02ae, 0x02aa, + 0x02a7, 0x02a3, 0x02a0, 0x029c, 0x0299, 0x0295, 0x0292, 0x028f, 0x028c, 0x0288, 0x0285, 0x0282, 0x027f, 0x027c, 0x0279, 0x0276, + 0x0273, 0x0270, 0x026d, 0x026a, 0x0267, 0x0264, 0x0261, 0x025e, 0x025c, 0x0259, 0x0256, 0x0253, 0x0251, 0x024e, 0x024b, 0x0249, + 0x0246, 0x0243, 0x0241, 0x023e, 0x023c, 0x0239, 0x0237, 0x0234, 0x0232, 0x0230, 0x022d, 0x022b, 0x0229, 0x0226, 0x0224, 0x0222, + 0x021f, 0x021d, 0x021b, 0x0219, 0x0216, 0x0214, 0x0212, 0x0210, 0x020e, 0x020c, 0x020a, 0x0208, 0x0206, 0x0204, 0x0202, 0x0200, + 0x01fe, 0x01fc, 0x01fa, 0x01f8, 0x01f6, 0x01f4, 0x01f2, 0x01f0, 0x01ee, 0x01ec, 0x01ea, 0x01e9, 0x01e7, 0x01e5, 0x01e3, 0x01e1, + 0x01e0, 0x01de, 0x01dc, 0x01da, 0x01d9, 0x01d7, 0x01d5, 0x01d4, 0x01d2, 0x01d0, 0x01cf, 0x01cd, 0x01cb, 0x01ca, 0x01c8, 0x01c7, + 0x01c5, 0x01c3, 0x01c2, 0x01c0, 0x01bf, 0x01bd, 0x01bc, 0x01ba, 0x01b9, 0x01b7, 0x01b6, 0x01b4, 0x01b3, 0x01b2, 0x01b0, 0x01af, + 0x01ad, 0x01ac, 0x01aa, 0x01a9, 0x01a8, 0x01a6, 0x01a5, 0x01a4, 0x01a2, 0x01a1, 0x01a0, 0x019e, 0x019d, 0x019c, 0x019a, 0x0199, + 0x0198, 0x0197, 0x0195, 0x0194, 0x0193, 0x0192, 0x0190, 0x018f, 0x018e, 0x018d, 0x018b, 0x018a, 0x0189, 0x0188, 0x0187, 0x0186, + 0x0184, 0x0183, 0x0182, 0x0181, 0x0180, 0x017f, 0x017e, 0x017d, 0x017b, 0x017a, 0x0179, 0x0178, 0x0177, 0x0176, 0x0175, 0x0174, + 0x0173, 0x0172, 0x0171, 0x0170, 0x016f, 0x016e, 0x016d, 0x016c, 0x016b, 0x016a, 0x0169, 0x0168, 0x0167, 0x0166, 0x0165, 0x0164, + 0x0163, 0x0162, 0x0161, 0x0160, 0x015f, 0x015e, 0x015d, 0x015c, 0x015b, 0x015a, 0x0159, 0x0158, 0x0158, 0x0157, 0x0156, 0x0155, + 0x0154, 0x0153, 0x0152, 0x0151, 0x0150, 0x0150, 0x014f, 0x014e, 0x014d, 0x014c, 0x014b, 0x014a, 0x014a, 0x0149, 0x0148, 0x0147, + 0x0146, 0x0146, 0x0145, 0x0144, 0x0143, 0x0142, 0x0142, 0x0141, 0x0140, 0x013f, 0x013e, 0x013e, 0x013d, 0x013c, 0x013b, 0x013b, + 0x013a, 0x0139, 0x0138, 0x0138, 0x0137, 0x0136, 0x0135, 0x0135, 0x0134, 0x0133, 0x0132, 0x0132, 0x0131, 0x0130, 0x0130, 0x012f, + 0x012e, 0x012e, 0x012d, 0x012c, 0x012b, 0x012b, 0x012a, 0x0129, 0x0129, 0x0128, 0x0127, 0x0127, 0x0126, 0x0125, 0x0125, 0x0124, + 0x0123, 0x0123, 0x0122, 0x0121, 0x0121, 0x0120, 0x0120, 0x011f, 0x011e, 0x011e, 0x011d, 0x011c, 0x011c, 0x011b, 0x011b, 0x011a, + 0x0119, 0x0119, 0x0118, 0x0118, 0x0117, 0x0116, 0x0116, 0x0115, 0x0115, 0x0114, 0x0113, 0x0113, 0x0112, 0x0112, 0x0111, 0x0111, + 0x0110, 0x010f, 0x010f, 0x010e, 0x010e, 0x010d, 0x010d, 0x010c, 0x010c, 0x010b, 0x010a, 0x010a, 0x0109, 0x0109, 0x0108, 0x0108, + 0x0107, 0x0107, 0x0106, 0x0106, 0x0105, 0x0105, 0x0104, 0x0104, 0x0103, 0x0103, 0x0102, 0x0102, 0x0101, 0x0101, 0x0100, 0x0100, + 0x00ff, 0x00ff, 0x00fe, 0x00fe, 0x00fd, 0x00fd, 0x00fc, 0x00fc, 0x00fb, 0x00fb, 0x00fa, 0x00fa, 0x00f9, 0x00f9, 0x00f8, 0x00f8, + 0x00f7, 0x00f7, 0x00f6, 0x00f6, 0x00f5, 0x00f5, 0x00f4, 0x00f4, 0x00f4, 0x00f3, 0x00f3, 0x00f2, 0x00f2, 0x00f1, 0x00f1, 0x00f0, + 0x00f0, 0x00f0, 0x00ef, 0x00ef, 0x00ee, 0x00ee, 0x00ed, 0x00ed, 0x00ed, 0x00ec, 0x00ec, 0x00eb, 0x00eb, 0x00ea, 0x00ea, 0x00ea, + 0x00e9, 0x00e9, 0x00e8, 0x00e8, 0x00e7, 0x00e7, 0x00e7, 0x00e6, 0x00e6, 0x00e5, 0x00e5, 0x00e5, 0x00e4, 0x00e4, 0x00e3, 0x00e3, + 0x00e3, 0x00e2, 0x00e2, 0x00e1, 0x00e1, 0x00e1, 0x00e0, 0x00e0, 0x00e0, 0x00df, 0x00df, 0x00de, 0x00de, 0x00de, 0x00dd, 0x00dd, + 0x00dd, 0x00dc, 0x00dc, 0x00db, 0x00db, 0x00db, 0x00da, 0x00da, 0x00da, 0x00d9, 0x00d9, 0x00d9, 0x00d8, 0x00d8, 0x00d7, 0x00d7, + 0x00d7, 
0x00d6, 0x00d6, 0x00d6, 0x00d5, 0x00d5, 0x00d5, 0x00d4, 0x00d4, 0x00d4, 0x00d3, 0x00d3, 0x00d3, 0x00d2, 0x00d2, 0x00d2, + 0x00d1, 0x00d1, 0x00d1, 0x00d0, 0x00d0, 0x00d0, 0x00cf, 0x00cf, 0x00cf, 0x00ce, 0x00ce, 0x00ce, 0x00cd, 0x00cd, 0x00cd, 0x00cc, + 0x00cc, 0x00cc, 0x00cb, 0x00cb, 0x00cb, 0x00ca, 0x00ca, 0x00ca, 0x00c9, 0x00c9, 0x00c9, 0x00c9, 0x00c8, 0x00c8, 0x00c8, 0x00c7, + 0x00c7, 0x00c7, 0x00c6, 0x00c6, 0x00c6, 0x00c5, 0x00c5, 0x00c5, 0x00c5, 0x00c4, 0x00c4, 0x00c4, 0x00c3, 0x00c3, 0x00c3, 0x00c3, + 0x00c2, 0x00c2, 0x00c2, 0x00c1, 0x00c1, 0x00c1, 0x00c1, 0x00c0, 0x00c0, 0x00c0, 0x00bf, 0x00bf, 0x00bf, 0x00bf, 0x00be, 0x00be, + 0x00be, 0x00bd, 0x00bd, 0x00bd, 0x00bd, 0x00bc, 0x00bc, 0x00bc, 0x00bc, 0x00bb, 0x00bb, 0x00bb, 0x00ba, 0x00ba, 0x00ba, 0x00ba, + 0x00b9, 0x00b9, 0x00b9, 0x00b9, 0x00b8, 0x00b8, 0x00b8, 0x00b8, 0x00b7, 0x00b7, 0x00b7, 0x00b7, 0x00b6, 0x00b6, 0x00b6, 0x00b6, + 0x00b5, 0x00b5, 0x00b5, 0x00b5, 0x00b4, 0x00b4, 0x00b4, 0x00b4, 0x00b3, 0x00b3, 0x00b3, 0x00b3, 0x00b2, 0x00b2, 0x00b2, 0x00b2, + 0x00b1, 0x00b1, 0x00b1, 0x00b1, 0x00b0, 0x00b0, 0x00b0, 0x00b0, 0x00af, 0x00af, 0x00af, 0x00af, 0x00ae, 0x00ae, 0x00ae, 0x00ae, + 0x00ae, 0x00ad, 0x00ad, 0x00ad, 0x00ad, 0x00ac, 0x00ac, 0x00ac, 0x00ac, 0x00ac, 0x00ab, 0x00ab, 0x00ab, 0x00ab, +}; + + +static tracy_force_inline uint64_t ProcessRGB( const uint8_t* src ) +{ +#ifdef __SSE4_1__ + __m128i px0 = _mm_loadu_si128(((__m128i*)src) + 0); + __m128i px1 = _mm_loadu_si128(((__m128i*)src) + 1); + __m128i px2 = _mm_loadu_si128(((__m128i*)src) + 2); + __m128i px3 = _mm_loadu_si128(((__m128i*)src) + 3); + + __m128i smask = _mm_set1_epi32( 0xF8FCF8 ); + __m128i sd0 = _mm_and_si128( px0, smask ); + __m128i sd1 = _mm_and_si128( px1, smask ); + __m128i sd2 = _mm_and_si128( px2, smask ); + __m128i sd3 = _mm_and_si128( px3, smask ); + + __m128i sc = _mm_shuffle_epi32(sd0, _MM_SHUFFLE(0, 0, 0, 0)); + + __m128i sc0 = _mm_cmpeq_epi8(sd0, sc); + __m128i sc1 = _mm_cmpeq_epi8(sd1, sc); + __m128i sc2 = _mm_cmpeq_epi8(sd2, sc); + __m128i sc3 = _mm_cmpeq_epi8(sd3, sc); + + __m128i sm0 = _mm_and_si128(sc0, sc1); + __m128i sm1 = _mm_and_si128(sc2, sc3); + __m128i sm = _mm_and_si128(sm0, sm1); + + if( _mm_testc_si128(sm, _mm_set1_epi32(-1)) ) + { + return uint64_t( to565( src[0], src[1], src[2] ) ) << 16; + } + + __m128i min0 = _mm_min_epu8( px0, px1 ); + __m128i min1 = _mm_min_epu8( px2, px3 ); + __m128i min2 = _mm_min_epu8( min0, min1 ); + + __m128i max0 = _mm_max_epu8( px0, px1 ); + __m128i max1 = _mm_max_epu8( px2, px3 ); + __m128i max2 = _mm_max_epu8( max0, max1 ); + + __m128i min3 = _mm_shuffle_epi32( min2, _MM_SHUFFLE( 2, 3, 0, 1 ) ); + __m128i max3 = _mm_shuffle_epi32( max2, _MM_SHUFFLE( 2, 3, 0, 1 ) ); + __m128i min4 = _mm_min_epu8( min2, min3 ); + __m128i max4 = _mm_max_epu8( max2, max3 ); + + __m128i min5 = _mm_shuffle_epi32( min4, _MM_SHUFFLE( 0, 0, 2, 2 ) ); + __m128i max5 = _mm_shuffle_epi32( max4, _MM_SHUFFLE( 0, 0, 2, 2 ) ); + __m128i rmin = _mm_min_epu8( min4, min5 ); + __m128i rmax = _mm_max_epu8( max4, max5 ); + + __m128i range1 = _mm_subs_epu8( rmax, rmin ); + __m128i range2 = _mm_sad_epu8( rmax, rmin ); + + uint32_t vrange = _mm_cvtsi128_si32( range2 ) >> 1; + __m128i range = _mm_set1_epi16( DivTable[vrange] ); + + __m128i inset1 = _mm_srli_epi16( range1, 4 ); + __m128i inset = _mm_and_si128( inset1, _mm_set1_epi8( 0xF ) ); + __m128i min = _mm_adds_epu8( rmin, inset ); + __m128i max = _mm_subs_epu8( rmax, inset ); + + __m128i c0 = _mm_subs_epu8( px0, rmin ); + __m128i c1 = _mm_subs_epu8( px1, rmin ); + __m128i c2 = 
_mm_subs_epu8( px2, rmin ); + __m128i c3 = _mm_subs_epu8( px3, rmin ); + + __m128i is0 = _mm_maddubs_epi16( c0, _mm_set1_epi8( 1 ) ); + __m128i is1 = _mm_maddubs_epi16( c1, _mm_set1_epi8( 1 ) ); + __m128i is2 = _mm_maddubs_epi16( c2, _mm_set1_epi8( 1 ) ); + __m128i is3 = _mm_maddubs_epi16( c3, _mm_set1_epi8( 1 ) ); + + __m128i s0 = _mm_hadd_epi16( is0, is1 ); + __m128i s1 = _mm_hadd_epi16( is2, is3 ); + + __m128i m0 = _mm_mulhi_epu16( s0, range ); + __m128i m1 = _mm_mulhi_epu16( s1, range ); + + __m128i p0 = _mm_packus_epi16( m0, m1 ); + + __m128i p1 = _mm_or_si128( _mm_srai_epi32( p0, 6 ), _mm_srai_epi32( p0, 12 ) ); + __m128i p2 = _mm_or_si128( _mm_srai_epi32( p0, 18 ), p0 ); + __m128i p3 = _mm_or_si128( p1, p2 ); + __m128i p =_mm_shuffle_epi8( p3, _mm_set1_epi32( 0x0C080400 ) ); + + uint32_t vmin = _mm_cvtsi128_si32( min ); + uint32_t vmax = _mm_cvtsi128_si32( max ); + uint32_t vp = _mm_cvtsi128_si32( p ); + + return uint64_t( ( uint64_t( to565( vmin ) ) << 16 ) | to565( vmax ) | ( uint64_t( vp ) << 32 ) ); +#elif defined __ARM_NEON +# ifdef __aarch64__ + uint8x16x4_t px = vld4q_u8( src ); + + uint8x16_t lr = px.val[0]; + uint8x16_t lg = px.val[1]; + uint8x16_t lb = px.val[2]; + + uint8_t rmaxr = vmaxvq_u8( lr ); + uint8_t rmaxg = vmaxvq_u8( lg ); + uint8_t rmaxb = vmaxvq_u8( lb ); + + uint8_t rminr = vminvq_u8( lr ); + uint8_t rming = vminvq_u8( lg ); + uint8_t rminb = vminvq_u8( lb ); + + int rr = rmaxr - rminr; + int rg = rmaxg - rming; + int rb = rmaxb - rminb; + + int vrange1 = rr + rg + rb; + uint16_t vrange2 = DivTableNEON[vrange1]; + + uint8_t insetr = rr >> 4; + uint8_t insetg = rg >> 4; + uint8_t insetb = rb >> 4; + + uint8_t minr = rminr + insetr; + uint8_t ming = rming + insetg; + uint8_t minb = rminb + insetb; + + uint8_t maxr = rmaxr - insetr; + uint8_t maxg = rmaxg - insetg; + uint8_t maxb = rmaxb - insetb; + + uint8x16_t cr = vsubq_u8( lr, vdupq_n_u8( rminr ) ); + uint8x16_t cg = vsubq_u8( lg, vdupq_n_u8( rming ) ); + uint8x16_t cb = vsubq_u8( lb, vdupq_n_u8( rminb ) ); + + uint16x8_t is0l = vaddl_u8( vget_low_u8( cr ), vget_low_u8( cg ) ); + uint16x8_t is0h = vaddl_u8( vget_high_u8( cr ), vget_high_u8( cg ) ); + uint16x8_t is1l = vaddw_u8( is0l, vget_low_u8( cb ) ); + uint16x8_t is1h = vaddw_u8( is0h, vget_high_u8( cb ) ); + + int16x8_t range = vdupq_n_s16( vrange2 ); + uint16x8_t m0 = vreinterpretq_u16_s16( vqdmulhq_s16( vreinterpretq_s16_u16( is1l ), range ) ); + uint16x8_t m1 = vreinterpretq_u16_s16( vqdmulhq_s16( vreinterpretq_s16_u16( is1h ), range ) ); + + uint8x8_t p00 = vmovn_u16( m0 ); + uint8x8_t p01 = vmovn_u16( m1 ); + uint8x16_t p0 = vcombine_u8( p00, p01 ); + + uint32x4_t p1 = vaddq_u32( vshrq_n_u32( vreinterpretq_u32_u8( p0 ), 6 ), vshrq_n_u32( vreinterpretq_u32_u8( p0 ), 12 ) ); + uint32x4_t p2 = vaddq_u32( vshrq_n_u32( vreinterpretq_u32_u8( p0 ), 18 ), vreinterpretq_u32_u8( p0 ) ); + uint32x4_t p3 = vaddq_u32( p1, p2 ); + + uint16x4x2_t p4 = vuzp_u16( vget_low_u16( vreinterpretq_u16_u32( p3 ) ), vget_high_u16( vreinterpretq_u16_u32( p3 ) ) ); + uint8x8x2_t p = vuzp_u8( vreinterpret_u8_u16( p4.val[0] ), vreinterpret_u8_u16( p4.val[0] ) ); + + uint32_t vp; + vst1_lane_u32( &vp, vreinterpret_u32_u8( p.val[0] ), 0 ); + + return uint64_t( ( uint64_t( to565( minr, ming, minb ) ) << 16 ) | to565( maxr, maxg, maxb ) | ( uint64_t( vp ) << 32 ) ); +# else + uint32x4_t px0 = vld1q_u32( (uint32_t*)src ); + uint32x4_t px1 = vld1q_u32( (uint32_t*)src + 4 ); + uint32x4_t px2 = vld1q_u32( (uint32_t*)src + 8 ); + uint32x4_t px3 = vld1q_u32( (uint32_t*)src + 12 ); + + 
uint32x4_t smask = vdupq_n_u32( 0xF8FCF8 ); + uint32x4_t sd0 = vandq_u32( smask, px0 ); + uint32x4_t sd1 = vandq_u32( smask, px1 ); + uint32x4_t sd2 = vandq_u32( smask, px2 ); + uint32x4_t sd3 = vandq_u32( smask, px3 ); + + uint32x4_t sc = vdupq_n_u32( sd0[0] ); + + uint32x4_t sc0 = vceqq_u32( sd0, sc ); + uint32x4_t sc1 = vceqq_u32( sd1, sc ); + uint32x4_t sc2 = vceqq_u32( sd2, sc ); + uint32x4_t sc3 = vceqq_u32( sd3, sc ); + + uint32x4_t sm0 = vandq_u32( sc0, sc1 ); + uint32x4_t sm1 = vandq_u32( sc2, sc3 ); + int64x2_t sm = vreinterpretq_s64_u32( vandq_u32( sm0, sm1 ) ); + + if( sm[0] == -1 && sm[1] == -1 ) + { + return uint64_t( to565( src[0], src[1], src[2] ) ) << 16; + } + + uint32x4_t mask = vdupq_n_u32( 0xFFFFFF ); + uint8x16_t l0 = vreinterpretq_u8_u32( vandq_u32( mask, px0 ) ); + uint8x16_t l1 = vreinterpretq_u8_u32( vandq_u32( mask, px1 ) ); + uint8x16_t l2 = vreinterpretq_u8_u32( vandq_u32( mask, px2 ) ); + uint8x16_t l3 = vreinterpretq_u8_u32( vandq_u32( mask, px3 ) ); + + uint8x16_t min0 = vminq_u8( l0, l1 ); + uint8x16_t min1 = vminq_u8( l2, l3 ); + uint8x16_t min2 = vminq_u8( min0, min1 ); + + uint8x16_t max0 = vmaxq_u8( l0, l1 ); + uint8x16_t max1 = vmaxq_u8( l2, l3 ); + uint8x16_t max2 = vmaxq_u8( max0, max1 ); + + uint8x16_t min3 = vreinterpretq_u8_u32( vrev64q_u32( vreinterpretq_u32_u8( min2 ) ) ); + uint8x16_t max3 = vreinterpretq_u8_u32( vrev64q_u32( vreinterpretq_u32_u8( max2 ) ) ); + + uint8x16_t min4 = vminq_u8( min2, min3 ); + uint8x16_t max4 = vmaxq_u8( max2, max3 ); + + uint8x16_t min5 = vcombine_u8( vget_high_u8( min4 ), vget_low_u8( min4 ) ); + uint8x16_t max5 = vcombine_u8( vget_high_u8( max4 ), vget_low_u8( max4 ) ); + + uint8x16_t rmin = vminq_u8( min4, min5 ); + uint8x16_t rmax = vmaxq_u8( max4, max5 ); + + uint8x16_t range1 = vsubq_u8( rmax, rmin ); + uint8x8_t range2 = vget_low_u8( range1 ); + uint8x8x2_t range3 = vzip_u8( range2, vdup_n_u8( 0 ) ); + uint16x4_t range4 = vreinterpret_u16_u8( range3.val[0] ); + + uint16_t vrange1; + uint16x4_t range5 = vpadd_u16( range4, range4 ); + uint16x4_t range6 = vpadd_u16( range5, range5 ); + vst1_lane_u16( &vrange1, range6, 0 ); + + uint32_t vrange2 = ( 2 << 16 ) / uint32_t( vrange1 + 1 ); + uint16x8_t range = vdupq_n_u16( vrange2 ); + + uint8x16_t inset = vshrq_n_u8( range1, 4 ); + uint8x16_t min = vaddq_u8( rmin, inset ); + uint8x16_t max = vsubq_u8( rmax, inset ); + + uint8x16_t c0 = vsubq_u8( l0, rmin ); + uint8x16_t c1 = vsubq_u8( l1, rmin ); + uint8x16_t c2 = vsubq_u8( l2, rmin ); + uint8x16_t c3 = vsubq_u8( l3, rmin ); + + uint16x8_t is0 = vpaddlq_u8( c0 ); + uint16x8_t is1 = vpaddlq_u8( c1 ); + uint16x8_t is2 = vpaddlq_u8( c2 ); + uint16x8_t is3 = vpaddlq_u8( c3 ); + + uint16x4_t is4 = vpadd_u16( vget_low_u16( is0 ), vget_high_u16( is0 ) ); + uint16x4_t is5 = vpadd_u16( vget_low_u16( is1 ), vget_high_u16( is1 ) ); + uint16x4_t is6 = vpadd_u16( vget_low_u16( is2 ), vget_high_u16( is2 ) ); + uint16x4_t is7 = vpadd_u16( vget_low_u16( is3 ), vget_high_u16( is3 ) ); + + uint16x8_t s0 = vcombine_u16( is4, is5 ); + uint16x8_t s1 = vcombine_u16( is6, is7 ); + + uint16x8_t m0 = vreinterpretq_u16_s16( vqdmulhq_s16( vreinterpretq_s16_u16( s0 ), vreinterpretq_s16_u16( range ) ) ); + uint16x8_t m1 = vreinterpretq_u16_s16( vqdmulhq_s16( vreinterpretq_s16_u16( s1 ), vreinterpretq_s16_u16( range ) ) ); + + uint8x8_t p00 = vmovn_u16( m0 ); + uint8x8_t p01 = vmovn_u16( m1 ); + uint8x16_t p0 = vcombine_u8( p00, p01 ); + + uint32x4_t p1 = vaddq_u32( vshrq_n_u32( vreinterpretq_u32_u8( p0 ), 6 ), vshrq_n_u32( 
vreinterpretq_u32_u8( p0 ), 12 ) ); + uint32x4_t p2 = vaddq_u32( vshrq_n_u32( vreinterpretq_u32_u8( p0 ), 18 ), vreinterpretq_u32_u8( p0 ) ); + uint32x4_t p3 = vaddq_u32( p1, p2 ); + + uint16x4x2_t p4 = vuzp_u16( vget_low_u16( vreinterpretq_u16_u32( p3 ) ), vget_high_u16( vreinterpretq_u16_u32( p3 ) ) ); + uint8x8x2_t p = vuzp_u8( vreinterpret_u8_u16( p4.val[0] ), vreinterpret_u8_u16( p4.val[0] ) ); + + uint32_t vmin, vmax, vp; + vst1q_lane_u32( &vmin, vreinterpretq_u32_u8( min ), 0 ); + vst1q_lane_u32( &vmax, vreinterpretq_u32_u8( max ), 0 ); + vst1_lane_u32( &vp, vreinterpret_u32_u8( p.val[0] ), 0 ); + + return uint64_t( ( uint64_t( to565( vmin ) ) << 16 ) | to565( vmax ) | ( uint64_t( vp ) << 32 ) ); +# endif +#else + const auto ref = to565( src[0], src[1], src[2] ); + auto stmp = src + 4; + for( int i=1; i<16; i++ ) + { + if( to565( stmp[0], stmp[1], stmp[2] ) != ref ) + { + break; + } + stmp += 4; + } + if( stmp == src + 64 ) + { + return uint64_t( ref ) << 16; + } + + uint8_t min[3] = { src[0], src[1], src[2] }; + uint8_t max[3] = { src[0], src[1], src[2] }; + auto tmp = src + 4; + for( int i=1; i<16; i++ ) + { + for( int j=0; j<3; j++ ) + { + if( tmp[j] < min[j] ) min[j] = tmp[j]; + else if( tmp[j] > max[j] ) max[j] = tmp[j]; + } + tmp += 4; + } + + const uint32_t range = DivTable[max[0] - min[0] + max[1] - min[1] + max[2] - min[2]]; + const uint32_t rmin = min[0] + min[1] + min[2]; + for( int i=0; i<3; i++ ) + { + const uint8_t inset = ( max[i] - min[i] ) >> 4; + min[i] += inset; + max[i] -= inset; + } + + uint32_t data = 0; + for( int i=0; i<16; i++ ) + { + const uint32_t c = src[0] + src[1] + src[2] - rmin; + const uint8_t idx = ( c * range ) >> 16; + data |= idx << (i*2); + src += 4; + } + + return uint64_t( ( uint64_t( to565( min[0], min[1], min[2] ) ) << 16 ) | to565( max[0], max[1], max[2] ) | ( uint64_t( data ) << 32 ) ); +#endif +} + +#ifdef __AVX2__ +static tracy_force_inline void ProcessRGB_AVX( const uint8_t* src, char*& dst ) +{ + __m256i px0 = _mm256_loadu_si256(((__m256i*)src) + 0); + __m256i px1 = _mm256_loadu_si256(((__m256i*)src) + 1); + __m256i px2 = _mm256_loadu_si256(((__m256i*)src) + 2); + __m256i px3 = _mm256_loadu_si256(((__m256i*)src) + 3); + + __m256i min0 = _mm256_min_epu8( px0, px1 ); + __m256i min1 = _mm256_min_epu8( px2, px3 ); + __m256i min2 = _mm256_min_epu8( min0, min1 ); + + __m256i max0 = _mm256_max_epu8( px0, px1 ); + __m256i max1 = _mm256_max_epu8( px2, px3 ); + __m256i max2 = _mm256_max_epu8( max0, max1 ); + + __m256i min3 = _mm256_shuffle_epi32( min2, _MM_SHUFFLE( 2, 3, 0, 1 ) ); + __m256i max3 = _mm256_shuffle_epi32( max2, _MM_SHUFFLE( 2, 3, 0, 1 ) ); + __m256i min4 = _mm256_min_epu8( min2, min3 ); + __m256i max4 = _mm256_max_epu8( max2, max3 ); + + __m256i min5 = _mm256_shuffle_epi32( min4, _MM_SHUFFLE( 0, 0, 2, 2 ) ); + __m256i max5 = _mm256_shuffle_epi32( max4, _MM_SHUFFLE( 0, 0, 2, 2 ) ); + __m256i rmin = _mm256_min_epu8( min4, min5 ); + __m256i rmax = _mm256_max_epu8( max4, max5 ); + + __m256i range1 = _mm256_subs_epu8( rmax, rmin ); + __m256i range2 = _mm256_sad_epu8( rmax, rmin ); + + uint16_t vrange0 = DivTableAVX[_mm256_cvtsi256_si32( range2 ) >> 1]; + uint16_t vrange1 = DivTableAVX[_mm256_extract_epi16( range2, 8 ) >> 1]; + __m256i range00 = _mm256_set1_epi16( vrange0 ); + __m256i range = _mm256_inserti128_si256( range00, _mm_set1_epi16( vrange1 ), 1 ); + + __m256i inset1 = _mm256_srli_epi16( range1, 4 ); + __m256i inset = _mm256_and_si256( inset1, _mm256_set1_epi8( 0xF ) ); + __m256i min = _mm256_adds_epu8( rmin, inset ); + __m256i 
max = _mm256_subs_epu8( rmax, inset ); + + __m256i c0 = _mm256_subs_epu8( px0, rmin ); + __m256i c1 = _mm256_subs_epu8( px1, rmin ); + __m256i c2 = _mm256_subs_epu8( px2, rmin ); + __m256i c3 = _mm256_subs_epu8( px3, rmin ); + + __m256i is0 = _mm256_maddubs_epi16( c0, _mm256_set1_epi8( 1 ) ); + __m256i is1 = _mm256_maddubs_epi16( c1, _mm256_set1_epi8( 1 ) ); + __m256i is2 = _mm256_maddubs_epi16( c2, _mm256_set1_epi8( 1 ) ); + __m256i is3 = _mm256_maddubs_epi16( c3, _mm256_set1_epi8( 1 ) ); + + __m256i s0 = _mm256_hadd_epi16( is0, is1 ); + __m256i s1 = _mm256_hadd_epi16( is2, is3 ); + + __m256i m0 = _mm256_mulhi_epu16( s0, range ); + __m256i m1 = _mm256_mulhi_epu16( s1, range ); + + __m256i p0 = _mm256_packus_epi16( m0, m1 ); + + __m256i p1 = _mm256_or_si256( _mm256_srai_epi32( p0, 6 ), _mm256_srai_epi32( p0, 12 ) ); + __m256i p2 = _mm256_or_si256( _mm256_srai_epi32( p0, 18 ), p0 ); + __m256i p3 = _mm256_or_si256( p1, p2 ); + __m256i p =_mm256_shuffle_epi8( p3, _mm256_set1_epi32( 0x0C080400 ) ); + + __m256i mm0 = _mm256_unpacklo_epi8( _mm256_setzero_si256(), min ); + __m256i mm1 = _mm256_unpacklo_epi8( _mm256_setzero_si256(), max ); + __m256i mm2 = _mm256_unpacklo_epi64( mm1, mm0 ); + __m256i mmr = _mm256_slli_epi64( _mm256_srli_epi64( mm2, 11 ), 11 ); + __m256i mmg = _mm256_slli_epi64( _mm256_srli_epi64( mm2, 26 ), 5 ); + __m256i mmb = _mm256_srli_epi64( _mm256_slli_epi64( mm2, 16 ), 59 ); + __m256i mm3 = _mm256_or_si256( mmr, mmg ); + __m256i mm4 = _mm256_or_si256( mm3, mmb ); + __m256i mm5 = _mm256_shuffle_epi8( mm4, _mm256_set1_epi32( 0x09080100 ) ); + + __m256i d0 = _mm256_unpacklo_epi32( mm5, p ); + __m256i d1 = _mm256_permute4x64_epi64( d0, _MM_SHUFFLE( 3, 2, 2, 0 ) ); + _mm_storeu_si128( (__m128i*)dst, _mm256_castsi256_si128( d1 ) ); + dst += 16; +} +#endif + +void CompressImageDxt1( const char* src, char* dst, int w, int h ) +{ + assert( (w % 4) == 0 && (h % 4) == 0 ); + +#ifdef __AVX2__ + if( w%8 == 0 ) + { + uint32_t buf[8*4]; + int i = 0; + + auto blocks = w * h / 32; + do + { + auto tmp = (char*)buf; + memcpy( tmp, src, 8*4 ); + memcpy( tmp + 8*4, src + w * 4, 8*4 ); + memcpy( tmp + 16*4, src + w * 8, 8*4 ); + memcpy( tmp + 24*4, src + w * 12, 8*4 ); + src += 8*4; + if( ++i == w/8 ) + { + src += w * 3 * 4; + i = 0; + } + + ProcessRGB_AVX( (uint8_t*)buf, dst ); + } + while( --blocks ); + } + else +#endif + { + uint32_t buf[4*4]; + int i = 0; + + auto ptr = dst; + auto blocks = w * h / 16; + do + { + auto tmp = (char*)buf; + memcpy( tmp, src, 4*4 ); + memcpy( tmp + 4*4, src + w * 4, 4*4 ); + memcpy( tmp + 8*4, src + w * 8, 4*4 ); + memcpy( tmp + 12*4, src + w * 12, 4*4 ); + src += 4*4; + if( ++i == w/4 ) + { + src += w * 3 * 4; + i = 0; + } + + const auto c = ProcessRGB( (uint8_t*)buf ); + memcpy( ptr, &c, sizeof( uint64_t ) ); + ptr += sizeof( uint64_t ); + } + while( --blocks ); + } +} + +} diff --git a/libs/tracy/client/TracyDxt1.hpp b/libs/tracy/client/TracyDxt1.hpp @@ -0,0 +1,11 @@ +#ifndef __TRACYDXT1_HPP__ +#define __TRACYDXT1_HPP__ + +namespace tracy +{ + +void CompressImageDxt1( const char* src, char* dst, int w, int h ); + +} + +#endif diff --git a/libs/tracy/client/TracyFastVector.hpp b/libs/tracy/client/TracyFastVector.hpp @@ -0,0 +1,116 @@ +#ifndef __TRACYFASTVECTOR_HPP__ +#define __TRACYFASTVECTOR_HPP__ + +#include <stddef.h> + +#include "../common/TracyAlloc.hpp" +#include "../common/TracyForceInline.hpp" + +namespace tracy +{ + +template<typename T> +class FastVector +{ +public: + using iterator = T*; + using const_iterator = const T*; + + FastVector( size_t 
capacity ) + : m_ptr( (T*)tracy_malloc( sizeof( T ) * capacity ) ) + , m_write( m_ptr ) + , m_end( m_ptr + capacity ) + { + } + + FastVector( const FastVector& ) = delete; + FastVector( FastVector&& ) = delete; + + ~FastVector() + { + tracy_free( m_ptr ); + } + + FastVector& operator=( const FastVector& ) = delete; + FastVector& operator=( FastVector&& ) = delete; + + bool empty() const { return m_ptr == m_write; } + size_t size() const { return m_write - m_ptr; } + + T* data() { return m_ptr; } + const T* data() const { return m_ptr; }; + + T* begin() { return m_ptr; } + const T* begin() const { return m_ptr; } + T* end() { return m_write; } + const T* end() const { return m_write; } + + T& front() { assert( !empty() ); return m_ptr[0]; } + const T& front() const { assert( !empty() ); return m_ptr[0]; } + + T& back() { assert( !empty() ); return m_write[-1]; } + const T& back() const { assert( !empty() ); return m_write[-1]; } + + T& operator[]( size_t idx ) { return m_ptr[idx]; } + const T& operator[]( size_t idx ) const { return m_ptr[idx]; } + + T* push_next() + { + if( m_write == m_end ) AllocMore(); + return m_write++; + } + + T* prepare_next() + { + if( m_write == m_end ) AllocMore(); + return m_write; + } + + void commit_next() + { + m_write++; + } + + void clear() + { + m_write = m_ptr; + } + + void swap( FastVector& vec ) + { + const auto ptr1 = m_ptr; + const auto ptr2 = vec.m_ptr; + const auto write1 = m_write; + const auto write2 = vec.m_write; + const auto end1 = m_end; + const auto end2 = vec.m_end; + + m_ptr = ptr2; + vec.m_ptr = ptr1; + m_write = write2; + vec.m_write = write1; + m_end = end2; + vec.m_end = end1; + } + +private: + tracy_no_inline void AllocMore() + { + const auto cap = ( m_end - m_ptr ) * 2; + const auto size = m_write - m_ptr; + T* ptr = (T*)tracy_malloc( sizeof( T ) * cap ); + memcpy( ptr, m_ptr, size * sizeof( T ) ); + tracy_free( m_ptr ); + m_ptr = ptr; + m_write = m_ptr + size; + m_end = m_ptr + cap; + } + + T* m_ptr; + T* m_write; + T* m_end; +}; + +} + +#endif diff --git a/libs/tracy/client/TracyLock.hpp b/libs/tracy/client/TracyLock.hpp @@ -0,0 +1,527 @@ +#ifndef __TRACYLOCK_HPP__ +#define __TRACYLOCK_HPP__ + +#include <atomic> +#include <limits> + +#include "../common/TracySystem.hpp" +#include "../common/TracyAlign.hpp" +#include "TracyProfiler.hpp" + +namespace tracy +{ + +class LockableCtx +{ +public: + tracy_force_inline LockableCtx( const SourceLocationData* srcloc ) + : m_id( GetLockCounter().fetch_add( 1, std::memory_order_relaxed ) ) +#ifdef TRACY_ON_DEMAND + , m_lockCount( 0 ) + , m_active( false ) +#endif + { + assert( m_id != std::numeric_limits<uint32_t>::max() ); + + Magic magic; + auto token = GetToken(); + auto& tail = token->get_tail_index(); + auto item = token->enqueue_begin( magic ); + MemWrite( &item->hdr.type, QueueType::LockAnnounce ); + MemWrite( &item->lockAnnounce.id, m_id ); + MemWrite( &item->lockAnnounce.time, Profiler::GetTime() ); + MemWrite( &item->lockAnnounce.lckloc, (uint64_t)srcloc ); + MemWrite( &item->lockAnnounce.type, LockType::Lockable ); +#ifdef TRACY_ON_DEMAND + GetProfiler().DeferItem( *item ); +#endif + tail.store( magic + 1, std::memory_order_release ); + } + + LockableCtx( const LockableCtx& ) = delete; + LockableCtx& operator=( const LockableCtx& ) = delete; + + tracy_force_inline ~LockableCtx() + { + Magic magic; + auto token = GetToken(); + auto& tail = token->get_tail_index(); + auto item = token->enqueue_begin( magic ); + MemWrite( &item->hdr.type, QueueType::LockTerminate ); + MemWrite( 
&item->lockTerminate.id, m_id ); + MemWrite( &item->lockTerminate.time, Profiler::GetTime() ); + MemWrite( &item->lockTerminate.type, LockType::Lockable ); +#ifdef TRACY_ON_DEMAND + GetProfiler().DeferItem( *item ); +#endif + tail.store( magic + 1, std::memory_order_release ); + } + + tracy_force_inline bool BeforeLock() + { +#ifdef TRACY_ON_DEMAND + bool queue = false; + const auto locks = m_lockCount.fetch_add( 1, std::memory_order_relaxed ); + const auto active = m_active.load( std::memory_order_relaxed ); + if( locks == 0 || active ) + { + const bool connected = GetProfiler().IsConnected(); + if( active != connected ) m_active.store( connected, std::memory_order_relaxed ); + if( connected ) queue = true; + } + if( !queue ) return false; +#endif + + auto item = Profiler::QueueSerial(); + MemWrite( &item->hdr.type, QueueType::LockWait ); + MemWrite( &item->lockWait.thread, GetThreadHandle() ); + MemWrite( &item->lockWait.id, m_id ); + MemWrite( &item->lockWait.time, Profiler::GetTime() ); + MemWrite( &item->lockWait.type, LockType::Lockable ); + Profiler::QueueSerialFinish(); + return true; + } + + tracy_force_inline void AfterLock() + { + auto item = Profiler::QueueSerial(); + MemWrite( &item->hdr.type, QueueType::LockObtain ); + MemWrite( &item->lockObtain.thread, GetThreadHandle() ); + MemWrite( &item->lockObtain.id, m_id ); + MemWrite( &item->lockObtain.time, Profiler::GetTime() ); + Profiler::QueueSerialFinish(); + } + + tracy_force_inline void AfterUnlock() + { +#ifdef TRACY_ON_DEMAND + m_lockCount.fetch_sub( 1, std::memory_order_relaxed ); + if( !m_active.load( std::memory_order_relaxed ) ) return; + if( !GetProfiler().IsConnected() ) + { + m_active.store( false, std::memory_order_relaxed ); + return; + } +#endif + + auto item = Profiler::QueueSerial(); + MemWrite( &item->hdr.type, QueueType::LockRelease ); + MemWrite( &item->lockRelease.thread, GetThreadHandle() ); + MemWrite( &item->lockRelease.id, m_id ); + MemWrite( &item->lockRelease.time, Profiler::GetTime() ); + Profiler::QueueSerialFinish(); + } + + tracy_force_inline void AfterTryLock( bool acquired ) + { +#ifdef TRACY_ON_DEMAND + if( !acquired ) return; + + bool queue = false; + const auto locks = m_lockCount.fetch_add( 1, std::memory_order_relaxed ); + const auto active = m_active.load( std::memory_order_relaxed ); + if( locks == 0 || active ) + { + const bool connected = GetProfiler().IsConnected(); + if( active != connected ) m_active.store( connected, std::memory_order_relaxed ); + if( connected ) queue = true; + } + if( !queue ) return; +#endif + + if( acquired ) + { + auto item = Profiler::QueueSerial(); + MemWrite( &item->hdr.type, QueueType::LockObtain ); + MemWrite( &item->lockObtain.thread, GetThreadHandle() ); + MemWrite( &item->lockObtain.id, m_id ); + MemWrite( &item->lockObtain.time, Profiler::GetTime() ); + Profiler::QueueSerialFinish(); + } + } + + tracy_force_inline void Mark( const SourceLocationData* srcloc ) + { +#ifdef TRACY_ON_DEMAND + const auto active = m_active.load( std::memory_order_relaxed ); + if( !active ) return; + const auto connected = GetProfiler().IsConnected(); + if( !connected ) + { + if( active ) m_active.store( false, std::memory_order_relaxed ); + return; + } +#endif + + auto item = Profiler::QueueSerial(); + MemWrite( &item->hdr.type, QueueType::LockMark ); + MemWrite( &item->lockMark.thread, GetThreadHandle() ); + MemWrite( &item->lockMark.id, m_id ); + MemWrite( &item->lockMark.srcloc, (uint64_t)srcloc ); + Profiler::QueueSerialFinish(); + } + +private: + uint32_t m_id; + 
+#ifdef TRACY_ON_DEMAND + std::atomic<uint32_t> m_lockCount; + std::atomic<bool> m_active; +#endif +}; + +template<class T> +class Lockable +{ +public: + tracy_force_inline Lockable( const SourceLocationData* srcloc ) + : m_ctx( srcloc ) + { + } + + Lockable( const Lockable& ) = delete; + Lockable& operator=( const Lockable& ) = delete; + + tracy_force_inline void lock() + { + const auto runAfter = m_ctx.BeforeLock(); + m_lockable.lock(); + if( runAfter ) m_ctx.AfterLock(); + } + + tracy_force_inline void unlock() + { + m_lockable.unlock(); + m_ctx.AfterUnlock(); + } + + tracy_force_inline bool try_lock() + { + const auto acquired = m_lockable.try_lock(); + m_ctx.AfterTryLock( acquired ); + return acquired; + } + + tracy_force_inline void Mark( const SourceLocationData* srcloc ) + { + m_ctx.Mark( srcloc ); + } + +private: + T m_lockable; + LockableCtx m_ctx; +}; + + +class SharedLockableCtx +{ +public: + tracy_force_inline SharedLockableCtx( const SourceLocationData* srcloc ) + : m_id( GetLockCounter().fetch_add( 1, std::memory_order_relaxed ) ) +#ifdef TRACY_ON_DEMAND + , m_lockCount( 0 ) + , m_active( false ) +#endif + { + assert( m_id != std::numeric_limits<uint32_t>::max() ); + + Magic magic; + auto token = GetToken(); + auto& tail = token->get_tail_index(); + auto item = token->enqueue_begin( magic ); + MemWrite( &item->hdr.type, QueueType::LockAnnounce ); + MemWrite( &item->lockAnnounce.id, m_id ); + MemWrite( &item->lockAnnounce.time, Profiler::GetTime() ); + MemWrite( &item->lockAnnounce.lckloc, (uint64_t)srcloc ); + MemWrite( &item->lockAnnounce.type, LockType::SharedLockable ); + +#ifdef TRACY_ON_DEMAND + GetProfiler().DeferItem( *item ); +#endif + + tail.store( magic + 1, std::memory_order_release ); + } + + SharedLockableCtx( const SharedLockableCtx& ) = delete; + SharedLockableCtx& operator=( const SharedLockableCtx& ) = delete; + + tracy_force_inline ~SharedLockableCtx() + { + Magic magic; + auto token = GetToken(); + auto& tail = token->get_tail_index(); + auto item = token->enqueue_begin( magic ); + MemWrite( &item->hdr.type, QueueType::LockTerminate ); + MemWrite( &item->lockTerminate.id, m_id ); + MemWrite( &item->lockTerminate.time, Profiler::GetTime() ); + MemWrite( &item->lockTerminate.type, LockType::SharedLockable ); + +#ifdef TRACY_ON_DEMAND + GetProfiler().DeferItem( *item ); +#endif + + tail.store( magic + 1, std::memory_order_release ); + } + + tracy_force_inline bool BeforeLock() + { +#ifdef TRACY_ON_DEMAND + bool queue = false; + const auto locks = m_lockCount.fetch_add( 1, std::memory_order_relaxed ); + const auto active = m_active.load( std::memory_order_relaxed ); + if( locks == 0 || active ) + { + const bool connected = GetProfiler().IsConnected(); + if( active != connected ) m_active.store( connected, std::memory_order_relaxed ); + if( connected ) queue = true; + } + if( !queue ) return false; +#endif + + auto item = Profiler::QueueSerial(); + MemWrite( &item->hdr.type, QueueType::LockWait ); + MemWrite( &item->lockWait.thread, GetThreadHandle() ); + MemWrite( &item->lockWait.id, m_id ); + MemWrite( &item->lockWait.time, Profiler::GetTime() ); + MemWrite( &item->lockWait.type, LockType::SharedLockable ); + Profiler::QueueSerialFinish(); + return true; + } + + tracy_force_inline void AfterLock() + { + auto item = Profiler::QueueSerial(); + MemWrite( &item->hdr.type, QueueType::LockObtain ); + MemWrite( &item->lockObtain.thread, GetThreadHandle() ); + MemWrite( &item->lockObtain.id, m_id ); + MemWrite( &item->lockObtain.time, Profiler::GetTime() ); + 
Profiler::QueueSerialFinish(); + } + + tracy_force_inline void AfterUnlock() + { +#ifdef TRACY_ON_DEMAND + m_lockCount.fetch_sub( 1, std::memory_order_relaxed ); + if( !m_active.load( std::memory_order_relaxed ) ) return; + if( !GetProfiler().IsConnected() ) + { + m_active.store( false, std::memory_order_relaxed ); + return; + } +#endif + + auto item = Profiler::QueueSerial(); + MemWrite( &item->hdr.type, QueueType::LockRelease ); + MemWrite( &item->lockRelease.thread, GetThreadHandle() ); + MemWrite( &item->lockRelease.id, m_id ); + MemWrite( &item->lockRelease.time, Profiler::GetTime() ); + Profiler::QueueSerialFinish(); + } + + tracy_force_inline void AfterTryLock( bool acquired ) + { +#ifdef TRACY_ON_DEMAND + if( !acquired ) return; + + bool queue = false; + const auto locks = m_lockCount.fetch_add( 1, std::memory_order_relaxed ); + const auto active = m_active.load( std::memory_order_relaxed ); + if( locks == 0 || active ) + { + const bool connected = GetProfiler().IsConnected(); + if( active != connected ) m_active.store( connected, std::memory_order_relaxed ); + if( connected ) queue = true; + } + if( !queue ) return; +#endif + + if( acquired ) + { + auto item = Profiler::QueueSerial(); + MemWrite( &item->hdr.type, QueueType::LockObtain ); + MemWrite( &item->lockObtain.thread, GetThreadHandle() ); + MemWrite( &item->lockObtain.id, m_id ); + MemWrite( &item->lockObtain.time, Profiler::GetTime() ); + Profiler::QueueSerialFinish(); + } + } + + tracy_force_inline bool BeforeLockShared() + { +#ifdef TRACY_ON_DEMAND + bool queue = false; + const auto locks = m_lockCount.fetch_add( 1, std::memory_order_relaxed ); + const auto active = m_active.load( std::memory_order_relaxed ); + if( locks == 0 || active ) + { + const bool connected = GetProfiler().IsConnected(); + if( active != connected ) m_active.store( connected, std::memory_order_relaxed ); + if( connected ) queue = true; + } + if( !queue ) return false; +#endif + + auto item = Profiler::QueueSerial(); + MemWrite( &item->hdr.type, QueueType::LockSharedWait ); + MemWrite( &item->lockWait.thread, GetThreadHandle() ); + MemWrite( &item->lockWait.id, m_id ); + MemWrite( &item->lockWait.time, Profiler::GetTime() ); + MemWrite( &item->lockWait.type, LockType::SharedLockable ); + Profiler::QueueSerialFinish(); + return true; + } + + tracy_force_inline void AfterLockShared() + { + auto item = Profiler::QueueSerial(); + MemWrite( &item->hdr.type, QueueType::LockSharedObtain ); + MemWrite( &item->lockObtain.thread, GetThreadHandle() ); + MemWrite( &item->lockObtain.id, m_id ); + MemWrite( &item->lockObtain.time, Profiler::GetTime() ); + Profiler::QueueSerialFinish(); + } + + tracy_force_inline void AfterUnlockShared() + { +#ifdef TRACY_ON_DEMAND + m_lockCount.fetch_sub( 1, std::memory_order_relaxed ); + if( !m_active.load( std::memory_order_relaxed ) ) return; + if( !GetProfiler().IsConnected() ) + { + m_active.store( false, std::memory_order_relaxed ); + return; + } +#endif + + auto item = Profiler::QueueSerial(); + MemWrite( &item->hdr.type, QueueType::LockSharedRelease ); + MemWrite( &item->lockRelease.thread, GetThreadHandle() ); + MemWrite( &item->lockRelease.id, m_id ); + MemWrite( &item->lockRelease.time, Profiler::GetTime() ); + Profiler::QueueSerialFinish(); + } + + tracy_force_inline void AfterTryLockShared( bool acquired ) + { +#ifdef TRACY_ON_DEMAND + if( !acquired ) return; + + bool queue = false; + const auto locks = m_lockCount.fetch_add( 1, std::memory_order_relaxed ); + const auto active = m_active.load( 
std::memory_order_relaxed ); + if( locks == 0 || active ) + { + const bool connected = GetProfiler().IsConnected(); + if( active != connected ) m_active.store( connected, std::memory_order_relaxed ); + if( connected ) queue = true; + } + if( !queue ) return; +#endif + + if( acquired ) + { + auto item = Profiler::QueueSerial(); + MemWrite( &item->hdr.type, QueueType::LockSharedObtain ); + MemWrite( &item->lockObtain.thread, GetThreadHandle() ); + MemWrite( &item->lockObtain.id, m_id ); + MemWrite( &item->lockObtain.time, Profiler::GetTime() ); + Profiler::QueueSerialFinish(); + } + } + + tracy_force_inline void Mark( const SourceLocationData* srcloc ) + { +#ifdef TRACY_ON_DEMAND + const auto active = m_active.load( std::memory_order_relaxed ); + if( !active ) return; + const auto connected = GetProfiler().IsConnected(); + if( !connected ) + { + if( active ) m_active.store( false, std::memory_order_relaxed ); + return; + } +#endif + + auto item = Profiler::QueueSerial(); + MemWrite( &item->hdr.type, QueueType::LockMark ); + MemWrite( &item->lockMark.thread, GetThreadHandle() ); + MemWrite( &item->lockMark.id, m_id ); + MemWrite( &item->lockMark.srcloc, (uint64_t)srcloc ); + Profiler::QueueSerialFinish(); + } + +private: + uint32_t m_id; + +#ifdef TRACY_ON_DEMAND + std::atomic<uint32_t> m_lockCount; + std::atomic<bool> m_active; +#endif +}; + +template<class T> +class SharedLockable +{ +public: + tracy_force_inline SharedLockable( const SourceLocationData* srcloc ) + : m_ctx( srcloc ) + { + } + + SharedLockable( const SharedLockable& ) = delete; + SharedLockable& operator=( const SharedLockable& ) = delete; + + tracy_force_inline void lock() + { + const auto runAfter = m_ctx.BeforeLock(); + m_lockable.lock(); + if( runAfter ) m_ctx.AfterLock(); + } + + tracy_force_inline void unlock() + { + m_lockable.unlock(); + m_ctx.AfterUnlock(); + } + + tracy_force_inline bool try_lock() + { + const auto acquired = m_lockable.try_lock(); + m_ctx.AfterTryLock( acquired ); + return acquired; + } + + tracy_force_inline void lock_shared() + { + const auto runAfter = m_ctx.BeforeLockShared(); + m_lockable.lock_shared(); + if( runAfter ) m_ctx.AfterLockShared(); + } + + tracy_force_inline void unlock_shared() + { + m_lockable.unlock_shared(); + m_ctx.AfterUnlockShared(); + } + + tracy_force_inline bool try_lock_shared() + { + const auto acquired = m_lockable.try_lock_shared(); + m_ctx.AfterTryLockShared( acquired ); + return acquired; + } + + tracy_force_inline void Mark( const SourceLocationData* srcloc ) + { + m_ctx.Mark( srcloc ); + } + +private: + T m_lockable; + SharedLockableCtx m_ctx; +}; + + +}; + +#endif diff --git a/libs/tracy/client/TracyProfiler.cpp b/libs/tracy/client/TracyProfiler.cpp @@ -0,0 +1,2729 @@ +#ifdef TRACY_ENABLE + +#ifdef _WIN32 +# ifndef NOMINMAX +# define NOMINMAX +# endif +# include <winsock2.h> +# include <windows.h> +# include <tlhelp32.h> +# include <inttypes.h> +# include <intrin.h> +#else +# include <sys/time.h> +# include <sys/param.h> +#endif + +#ifdef __CYGWIN__ +# include <windows.h> +# include <unistd.h> +# include <tlhelp32.h> +#endif + +#ifdef _GNU_SOURCE +# include <errno.h> +#endif + +#ifdef __linux__ +# include <dirent.h> +# include <signal.h> +# include <pthread.h> +# include <sys/types.h> +# include <sys/syscall.h> +#endif + +#if defined __APPLE__ || defined BSD +# include <sys/types.h> +# include <sys/sysctl.h> +#endif + +#include <algorithm> +#include <assert.h> +#include <atomic> +#include <chrono> +#include <limits> +#include <new> +#include <stdlib.h> 
+#include <string.h> +#include <thread> + +#include "../common/TracyAlign.hpp" +#include "../common/TracyProtocol.hpp" +#include "../common/TracySocket.hpp" +#include "../common/TracySystem.hpp" +#include "../common/tracy_lz4.hpp" +#include "tracy_rpmalloc.hpp" +#include "TracyCallstack.hpp" +#include "TracyDxt1.hpp" +#include "TracyScoped.hpp" +#include "TracyProfiler.hpp" +#include "TracyThread.hpp" +#include "TracyArmCpuTable.hpp" +#include "TracySysTrace.hpp" +#include "../TracyC.h" + +#ifdef __APPLE__ +# define TRACY_DELAYED_INIT +#else +# ifdef __GNUC__ +# define init_order( val ) __attribute__ ((init_priority(val))) +# else +# define init_order(x) +# endif +#endif + +#if defined TRACY_HW_TIMER && __ARM_ARCH >= 6 && !defined TARGET_OS_IOS +# include <signal.h> +# include <setjmp.h> +#endif + +#if defined _WIN32 || defined __CYGWIN__ +# include <lmcons.h> +extern "C" typedef LONG (WINAPI *t_RtlGetVersion)( PRTL_OSVERSIONINFOW ); +#else +# include <unistd.h> +# include <limits.h> +#endif +#if defined __APPLE__ +# include "TargetConditionals.h" +#endif +#if defined __linux__ +# include <sys/sysinfo.h> +# include <sys/utsname.h> +#endif + +#if !defined _WIN32 && !defined __CYGWIN__ && ( defined __i386 || defined _M_IX86 || defined __x86_64__ || defined _M_X64 ) +# include <cpuid.h> +#endif + +#if !( ( ( defined _WIN32 || defined __CYGWIN__ ) && _WIN32_WINNT >= _WIN32_WINNT_VISTA ) || defined __linux__ ) +# include <mutex> +#endif + +namespace tracy +{ + +#ifndef TRACY_DELAYED_INIT +namespace +{ +# if ( defined _WIN32 || defined __CYGWIN__ ) && _WIN32_WINNT >= _WIN32_WINNT_VISTA + BOOL CALLBACK InitOnceCallback( PINIT_ONCE /*initOnce*/, PVOID /*Parameter*/, PVOID* /*Context*/) + { + rpmalloc_initialize(); + return TRUE; + } + INIT_ONCE InitOnce = INIT_ONCE_STATIC_INIT; +# elif defined __linux__ + void InitOnceCallback() + { + rpmalloc_initialize(); + } + pthread_once_t once_control = PTHREAD_ONCE_INIT; +# else + void InitOnceCallback() + { + rpmalloc_initialize(); + } + std::once_flag once_flag; +# endif +} + +struct RPMallocInit +{ + RPMallocInit() + { +# if ( defined _WIN32 || defined __CYGWIN__ ) && _WIN32_WINNT >= _WIN32_WINNT_VISTA + InitOnceExecuteOnce( &InitOnce, InitOnceCallback, nullptr, nullptr ); +# elif defined __linux__ + pthread_once( &once_control, InitOnceCallback ); +# else + std::call_once( once_flag, InitOnceCallback ); +# endif + rpmalloc_thread_initialize(); + } +}; + +struct InitTimeWrapper +{ + int64_t val; +}; + +struct ProducerWrapper +{ + tracy::moodycamel::ConcurrentQueue<QueueItem>::ExplicitProducer* ptr; +}; + +struct ThreadHandleWrapper +{ + uint64_t val; +}; +#endif + + +#if defined TRACY_HW_TIMER && ( defined __i386 || defined _M_IX86 || defined __x86_64__ || defined _M_X64 ) +static inline void CpuId( uint32_t* regs, uint32_t leaf ) +{ +#if defined _WIN32 || defined __CYGWIN__ + __cpuidex( (int*)regs, leaf, 0 ); +#else + __get_cpuid( leaf, regs, regs+1, regs+2, regs+3 ); +#endif +} + +static void InitFailure( const char* msg ) +{ +#if defined _WIN32 || defined __CYGWIN__ + bool hasConsole = false; + bool reopen = false; + const auto attached = AttachConsole( ATTACH_PARENT_PROCESS ); + if( attached ) + { + hasConsole = true; + reopen = true; + } + else + { + const auto err = GetLastError(); + if( err == ERROR_ACCESS_DENIED ) + { + hasConsole = true; + } + } + if( hasConsole ) + { + fprintf( stderr, "Tracy Profiler initialization failure: %s\n", msg ); + if( reopen ) + { + freopen( "CONOUT$", "w", stderr ); + fprintf( stderr, "Tracy Profiler initialization 
failure: %s\n", msg ); + } + } + else + { + MessageBoxA( nullptr, msg, "Tracy Profiler initialization failure", MB_ICONSTOP ); + } +#else + fprintf( stderr, "Tracy Profiler initialization failure: %s\n", msg ); +#endif + exit( 0 ); +} + +static int64_t SetupHwTimer() +{ + uint32_t regs[4]; + CpuId( regs, 0x80000001 ); + if( !( regs[3] & ( 1 << 27 ) ) ) InitFailure( "CPU doesn't support RDTSCP instruction." ); + CpuId( regs, 0x80000007 ); + if( !( regs[3] & ( 1 << 8 ) ) ) + { + const char* noCheck = getenv( "TRACY_NO_INVARIANT_CHECK" ); + if( !noCheck || noCheck[0] != '1' ) + { + InitFailure( "CPU doesn't support invariant TSC.\nDefine TRACY_NO_INVARIANT_CHECK=1 to ignore this error, *if you know what you are doing*." ); + } + } + + return Profiler::GetTime(); +} +#else +static int64_t SetupHwTimer() +{ + return Profiler::GetTime(); +} +#endif + +static const char* GetProcessName() +{ + const char* processName = "unknown"; +#ifdef _WIN32 + static char buf[_MAX_PATH]; + GetModuleFileNameA( nullptr, buf, _MAX_PATH ); + const char* ptr = buf; + while( *ptr != '\0' ) ptr++; + while( ptr > buf && *ptr != '\\' && *ptr != '/' ) ptr--; + if( ptr > buf ) ptr++; + processName = ptr; +#elif defined __ANDROID__ +# if __ANDROID_API__ >= 21 + auto buf = getprogname(); + if( buf ) processName = buf; +# endif +#elif defined _GNU_SOURCE || defined __CYGWIN__ + processName = program_invocation_short_name; +#elif defined __APPLE__ || defined BSD + auto buf = getprogname(); + if( buf ) processName = buf; +#endif + return processName; +} + +static uint32_t GetHex( char*& ptr, int skip ) +{ + uint32_t ret; + ptr += skip; + char* end; + if( ptr[0] == '0' && ptr[1] == 'x' ) + { + ptr += 2; + ret = strtol( ptr, &end, 16 ); + } + else + { + ret = strtol( ptr, &end, 10 ); + } + ptr = end; + return ret; +} + +static const char* GetHostInfo() +{ + static char buf[1024]; + auto ptr = buf; +#if defined _WIN32 || defined __CYGWIN__ +# ifdef UNICODE + t_RtlGetVersion RtlGetVersion = (t_RtlGetVersion)GetProcAddress( GetModuleHandle( L"ntdll.dll" ), "RtlGetVersion" ); +# else + t_RtlGetVersion RtlGetVersion = (t_RtlGetVersion)GetProcAddress( GetModuleHandle( "ntdll.dll" ), "RtlGetVersion" ); +# endif + + if( !RtlGetVersion ) + { +# ifdef __CYGWIN__ + ptr += sprintf( ptr, "OS: Windows (Cygwin)\n" ); +# elif defined __MINGW32__ + ptr += sprintf( ptr, "OS: Windows (MingW)\n" ); +# else + ptr += sprintf( ptr, "OS: Windows\n" ); +# endif + } + else + { + RTL_OSVERSIONINFOW ver = { sizeof( RTL_OSVERSIONINFOW ) }; + RtlGetVersion( &ver ); + +# ifdef __CYGWIN__ + ptr += sprintf( ptr, "OS: Windows %i.%i.%i (Cygwin)\n", ver.dwMajorVersion, ver.dwMinorVersion, ver.dwBuildNumber ); +# elif defined __MINGW32__ + ptr += sprintf( ptr, "OS: Windows %i.%i.%i (MingW)\n", (int)ver.dwMajorVersion, (int)ver.dwMinorVersion, (int)ver.dwBuildNumber ); +# else + ptr += sprintf( ptr, "OS: Windows %i.%i.%i\n", ver.dwMajorVersion, ver.dwMinorVersion, ver.dwBuildNumber ); +# endif + } +#elif defined __linux__ + struct utsname utsName; + uname( &utsName ); +# if defined __ANDROID__ + ptr += sprintf( ptr, "OS: Linux %s (Android)\n", utsName.release ); +# else + ptr += sprintf( ptr, "OS: Linux %s\n", utsName.release ); +# endif +#elif defined __APPLE__ +# if TARGET_OS_IPHONE == 1 + ptr += sprintf( ptr, "OS: Darwin (iOS)\n" ); +# elif TARGET_OS_MAC == 1 + ptr += sprintf( ptr, "OS: Darwin (OSX)\n" ); +# else + ptr += sprintf( ptr, "OS: Darwin (unknown)\n" ); +# endif +#elif defined __DragonFly__ + ptr += sprintf( ptr, "OS: BSD (DragonFly)\n" ); +#elif 
defined __FreeBSD__ + ptr += sprintf( ptr, "OS: BSD (FreeBSD)\n" ); +#elif defined __NetBSD__ + ptr += sprintf( ptr, "OS: BSD (NetBSD)\n" ); +#elif defined __OpenBSD__ + ptr += sprintf( ptr, "OS: BSD (OpenBSD)\n" ); +#else + ptr += sprintf( ptr, "OS: unknown\n" ); +#endif + +#if defined _MSC_VER +# if defined __clang__ + ptr += sprintf( ptr, "Compiler: MSVC clang-cl %i.%i.%i\n", __clang_major__, __clang_minor__, __clang_patchlevel__ ); +# else + ptr += sprintf( ptr, "Compiler: MSVC %i\n", _MSC_VER ); +# endif +#elif defined __clang__ + ptr += sprintf( ptr, "Compiler: clang %i.%i.%i\n", __clang_major__, __clang_minor__, __clang_patchlevel__ ); +#elif defined __GNUC__ + ptr += sprintf( ptr, "Compiler: gcc %i.%i\n", __GNUC__, __GNUC_MINOR__ ); +#else + ptr += sprintf( ptr, "Compiler: unknown\n" ); +#endif + +#if defined _WIN32 || defined __CYGWIN__ +# ifndef __CYGWIN__ + InitWinSock(); +# endif + char hostname[512]; + gethostname( hostname, 512 ); + + DWORD userSz = UNLEN+1; + char user[UNLEN+1]; + GetUserNameA( user, &userSz ); + + ptr += sprintf( ptr, "User: %s@%s\n", user, hostname ); +#else + char hostname[_POSIX_HOST_NAME_MAX]{}; + char user[_POSIX_LOGIN_NAME_MAX]{}; + + gethostname( hostname, _POSIX_HOST_NAME_MAX ); +# if defined __ANDROID__ + const auto login = getlogin(); + if( login ) + { + strcpy( user, login ); + } + else + { + memcpy( user, "(?)", 4 ); + } +# else + getlogin_r( user, _POSIX_LOGIN_NAME_MAX ); +# endif + + ptr += sprintf( ptr, "User: %s@%s\n", user, hostname ); +#endif + +#if defined __i386 || defined _M_IX86 + ptr += sprintf( ptr, "Arch: x86\n" ); +#elif defined __x86_64__ || defined _M_X64 + ptr += sprintf( ptr, "Arch: x64\n" ); +#elif defined __aarch64__ + ptr += sprintf( ptr, "Arch: ARM64\n" ); +#elif defined __ARM_ARCH + ptr += sprintf( ptr, "Arch: ARM\n" ); +#else + ptr += sprintf( ptr, "Arch: unknown\n" ); +#endif + +#if defined __i386 || defined _M_IX86 || defined __x86_64__ || defined _M_X64 + uint32_t regs[4]; + char cpuModel[4*4*3]; + auto modelPtr = cpuModel; + for( uint32_t i=0x80000002; i<0x80000005; ++i ) + { +# if defined _WIN32 || defined __CYGWIN__ + __cpuidex( (int*)regs, i, 0 ); +# else + int zero = 0; + asm volatile ( "cpuid" : "=a" (regs[0]), "=b" (regs[1]), "=c" (regs[2]), "=d" (regs[3]) : "a" (i), "c" (zero) ); +# endif + memcpy( modelPtr, regs, sizeof( regs ) ); modelPtr += sizeof( regs ); + } + + ptr += sprintf( ptr, "CPU: %s\n", cpuModel ); +#elif defined __linux__ && defined __ARM_ARCH + bool cpuFound = false; + FILE* fcpuinfo = fopen( "/proc/cpuinfo", "rb" ); + if( fcpuinfo ) + { + enum { BufSize = 4*1024 }; + char buf[BufSize]; + const auto sz = fread( buf, 1, BufSize, fcpuinfo ); + fclose( fcpuinfo ); + const auto end = buf + sz; + auto cptr = buf; + + uint32_t impl = 0; + uint32_t var = 0; + uint32_t part = 0; + uint32_t rev = 0; + + while( end - cptr > 20 ) + { + while( end - cptr > 20 && memcmp( cptr, "CPU ", 4 ) != 0 ) + { + cptr += 4; + while( end - cptr > 20 && *cptr != '\n' ) cptr++; + cptr++; + } + if( end - cptr <= 20 ) break; + cptr += 4; + if( memcmp( cptr, "implementer\t: ", 14 ) == 0 ) + { + if( impl != 0 ) break; + impl = GetHex( cptr, 14 ); + } + else if( memcmp( cptr, "variant\t: ", 10 ) == 0 ) var = GetHex( cptr, 10 ); + else if( memcmp( cptr, "part\t: ", 7 ) == 0 ) part = GetHex( cptr, 7 ); + else if( memcmp( cptr, "revision\t: ", 11 ) == 0 ) rev = GetHex( cptr, 11 ); + while( *cptr != '\n' && *cptr != '\0' ) cptr++; + cptr++; + } + + if( impl != 0 || var != 0 || part != 0 || rev != 0 ) + { + cpuFound = true; + ptr += 
sprintf( ptr, "CPU: %s%s r%ip%i\n", DecodeArmImplementer( impl ), DecodeArmPart( impl, part ), var, rev ); + } + } + if( !cpuFound ) + { + ptr += sprintf( ptr, "CPU: unknown\n" ); + } +#elif defined __APPLE__ && TARGET_OS_IPHONE == 1 + { + size_t sz; + sysctlbyname( "hw.machine", nullptr, &sz, nullptr, 0 ); + auto str = (char*)tracy_malloc( sz ); + sysctlbyname( "hw.machine", str, &sz, nullptr, 0 ); + ptr += sprintf( ptr, "Device: %s\n", DecodeIosDevice( str ) ); + tracy_free( str ); + } +#else + ptr += sprintf( ptr, "CPU: unknown\n" ); +#endif + + ptr += sprintf( ptr, "CPU cores: %i\n", std::thread::hardware_concurrency() ); + +#if defined _WIN32 || defined __CYGWIN__ + MEMORYSTATUSEX statex; + statex.dwLength = sizeof( statex ); + GlobalMemoryStatusEx( &statex ); +# ifdef _MSC_VER + ptr += sprintf( ptr, "RAM: %I64u MB\n", statex.ullTotalPhys / 1024 / 1024 ); +# else + ptr += sprintf( ptr, "RAM: %llu MB\n", statex.ullTotalPhys / 1024 / 1024 ); +# endif +#elif defined __linux__ + struct sysinfo sysInfo; + sysinfo( &sysInfo ); + ptr += sprintf( ptr, "RAM: %lu MB\n", sysInfo.totalram / 1024 / 1024 ); +#elif defined __APPLE__ + size_t memSize; + size_t sz = sizeof( memSize ); + sysctlbyname( "hw.memsize", &memSize, &sz, nullptr, 0 ); + ptr += sprintf( ptr, "RAM: %zu MB\n", memSize / 1024 / 1024 ); +#elif defined BSD + size_t memSize; + size_t sz = sizeof( memSize ); + sysctlbyname( "hw.physmem", &memSize, &sz, nullptr, 0 ); + ptr += sprintf( ptr, "RAM: %zu MB\n", memSize / 1024 / 1024 ); +#else + ptr += sprintf( ptr, "RAM: unknown\n" ); +#endif + + return buf; +} + +static uint64_t GetPid() +{ +#if defined _WIN32 || defined __CYGWIN__ + return uint64_t( GetCurrentProcessId() ); +#else + return uint64_t( getpid() ); +#endif +} + +static BroadcastMessage& GetBroadcastMessage( const char* procname, size_t pnsz, int& len ) +{ + static BroadcastMessage msg; + + msg.broadcastVersion = BroadcastVersion; + msg.protocolVersion = ProtocolVersion; + + memcpy( msg.programName, procname, pnsz ); + memset( msg.programName + pnsz, 0, WelcomeMessageProgramNameSize - pnsz ); + + len = int( offsetof( BroadcastMessage, programName ) + pnsz + 1 ); + return msg; +} + +#if defined _WIN32 || defined __CYGWIN__ +static DWORD s_profilerThreadId = 0; +static char s_crashText[1024]; + +LONG WINAPI CrashFilter( PEXCEPTION_POINTERS pExp ) +{ + const unsigned ec = pExp->ExceptionRecord->ExceptionCode; + auto msgPtr = s_crashText; + switch( ec ) + { + case EXCEPTION_ACCESS_VIOLATION: + msgPtr += sprintf( msgPtr, "Exception EXCEPTION_ACCESS_VIOLATION (0x%x). ", ec ); + switch( pExp->ExceptionRecord->ExceptionInformation[0] ) + { + case 0: + msgPtr += sprintf( msgPtr, "Read violation at address 0x%" PRIxPTR ".", pExp->ExceptionRecord->ExceptionInformation[1] ); + break; + case 1: + msgPtr += sprintf( msgPtr, "Write violation at address 0x%" PRIxPTR ".", pExp->ExceptionRecord->ExceptionInformation[1] ); + break; + case 8: + msgPtr += sprintf( msgPtr, "DEP violation at address 0x%" PRIxPTR ".", pExp->ExceptionRecord->ExceptionInformation[1] ); + break; + default: + break; + } + break; + case EXCEPTION_ARRAY_BOUNDS_EXCEEDED: + msgPtr += sprintf( msgPtr, "Exception EXCEPTION_ARRAY_BOUNDS_EXCEEDED (0x%x). ", ec ); + break; + case EXCEPTION_DATATYPE_MISALIGNMENT: + msgPtr += sprintf( msgPtr, "Exception EXCEPTION_DATATYPE_MISALIGNMENT (0x%x). ", ec ); + break; + case EXCEPTION_FLT_DIVIDE_BY_ZERO: + msgPtr += sprintf( msgPtr, "Exception EXCEPTION_FLT_DIVIDE_BY_ZERO (0x%x). 
", ec ); + break; + case EXCEPTION_ILLEGAL_INSTRUCTION: + msgPtr += sprintf( msgPtr, "Exception EXCEPTION_ILLEGAL_INSTRUCTION (0x%x). ", ec ); + break; + case EXCEPTION_IN_PAGE_ERROR: + msgPtr += sprintf( msgPtr, "Exception EXCEPTION_IN_PAGE_ERROR (0x%x). ", ec ); + break; + case EXCEPTION_INT_DIVIDE_BY_ZERO: + msgPtr += sprintf( msgPtr, "Exception EXCEPTION_INT_DIVIDE_BY_ZERO (0x%x). ", ec ); + break; + case EXCEPTION_PRIV_INSTRUCTION: + msgPtr += sprintf( msgPtr, "Exception EXCEPTION_PRIV_INSTRUCTION (0x%x). ", ec ); + break; + case EXCEPTION_STACK_OVERFLOW: + msgPtr += sprintf( msgPtr, "Exception EXCEPTION_STACK_OVERFLOW (0x%x). ", ec ); + break; + default: + return EXCEPTION_CONTINUE_SEARCH; + } + + { + Magic magic; + auto token = GetToken(); + auto& tail = token->get_tail_index(); + auto item = token->enqueue_begin( magic ); + MemWrite( &item->hdr.type, QueueType::CrashReport ); + item->crashReport.time = Profiler::GetTime(); + item->crashReport.text = (uint64_t)s_crashText; + tail.store( magic + 1, std::memory_order_release ); + + GetProfiler().SendCallstack( 60, "KiUserExceptionDispatcher" ); + } + + HANDLE h = CreateToolhelp32Snapshot( TH32CS_SNAPTHREAD, 0 ); + if( h == INVALID_HANDLE_VALUE ) return EXCEPTION_CONTINUE_SEARCH; + + THREADENTRY32 te = { sizeof( te ) }; + if( !Thread32First( h, &te ) ) + { + CloseHandle( h ); + return EXCEPTION_CONTINUE_SEARCH; + } + + const auto pid = GetCurrentProcessId(); + const auto tid = GetCurrentThreadId(); + + do + { + if( te.th32OwnerProcessID == pid && te.th32ThreadID != tid && te.th32ThreadID != s_profilerThreadId ) + { + HANDLE th = OpenThread( THREAD_SUSPEND_RESUME, FALSE, te.th32ThreadID ); + if( th != INVALID_HANDLE_VALUE ) + { + SuspendThread( th ); + CloseHandle( th ); + } + } + } + while( Thread32Next( h, &te ) ); + CloseHandle( h ); + + { + Magic magic; + auto token = GetToken(); + auto& tail = token->get_tail_index(); + auto item = token->enqueue_begin( magic ); + MemWrite( &item->hdr.type, QueueType::Crash ); + tail.store( magic + 1, std::memory_order_release ); + } + + std::this_thread::sleep_for( std::chrono::milliseconds( 500 ) ); + GetProfiler().RequestShutdown(); + while( !GetProfiler().HasShutdownFinished() ) { std::this_thread::sleep_for( std::chrono::milliseconds( 10 ) ); }; + + TerminateProcess( GetCurrentProcess(), 1 ); + + return EXCEPTION_CONTINUE_SEARCH; +} +#endif + +#ifdef __linux__ +static long s_profilerTid = 0; +static char s_crashText[1024]; +static std::atomic<bool> s_alreadyCrashed( false ); + +static void ThreadFreezer( int /*signal*/ ) +{ + for(;;) sleep( 1000 ); +} + +static inline void HexPrint( char*& ptr, uint64_t val ) +{ + if( val == 0 ) + { + *ptr++ = '0'; + return; + } + + static const char HexTable[16] = { '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'a', 'b', 'c', 'd', 'e', 'f' }; + char buf[16]; + auto bptr = buf; + + do + { + *bptr++ = HexTable[val%16]; + val /= 16; + } + while( val > 0 ); + + do + { + *ptr++ = *--bptr; + } + while( bptr != buf ); +} + +static void CrashHandler( int signal, siginfo_t* info, void* /*ucontext*/ ) +{ + bool expected = false; + if( !s_alreadyCrashed.compare_exchange_strong( expected, true ) ) ThreadFreezer( signal ); + + auto msgPtr = s_crashText; + switch( signal ) + { + case SIGILL: + strcpy( msgPtr, "Illegal Instruction.\n" ); + while( *msgPtr ) msgPtr++; + switch( info->si_code ) + { + case ILL_ILLOPC: + strcpy( msgPtr, "Illegal opcode.\n" ); + break; + case ILL_ILLOPN: + strcpy( msgPtr, "Illegal operand.\n" ); + break; + case ILL_ILLADR: + strcpy( msgPtr, 
"Illegal addressing mode.\n" ); + break; + case ILL_ILLTRP: + strcpy( msgPtr, "Illegal trap.\n" ); + break; + case ILL_PRVOPC: + strcpy( msgPtr, "Privileged opcode.\n" ); + break; + case ILL_PRVREG: + strcpy( msgPtr, "Privileged register.\n" ); + break; + case ILL_COPROC: + strcpy( msgPtr, "Coprocessor error.\n" ); + break; + case ILL_BADSTK: + strcpy( msgPtr, "Internal stack error.\n" ); + break; + default: + break; + } + break; + case SIGFPE: + strcpy( msgPtr, "Floating-point exception.\n" ); + while( *msgPtr ) msgPtr++; + switch( info->si_code ) + { + case FPE_INTDIV: + strcpy( msgPtr, "Integer divide by zero.\n" ); + break; + case FPE_INTOVF: + strcpy( msgPtr, "Integer overflow.\n" ); + break; + case FPE_FLTDIV: + strcpy( msgPtr, "Floating-point divide by zero.\n" ); + break; + case FPE_FLTOVF: + strcpy( msgPtr, "Floating-point overflow.\n" ); + break; + case FPE_FLTUND: + strcpy( msgPtr, "Floating-point underflow.\n" ); + break; + case FPE_FLTRES: + strcpy( msgPtr, "Floating-point inexact result.\n" ); + break; + case FPE_FLTINV: + strcpy( msgPtr, "Floating-point invalid operation.\n" ); + break; + case FPE_FLTSUB: + strcpy( msgPtr, "Subscript out of range.\n" ); + break; + default: + break; + } + break; + case SIGSEGV: + strcpy( msgPtr, "Invalid memory reference.\n" ); + while( *msgPtr ) msgPtr++; + switch( info->si_code ) + { + case SEGV_MAPERR: + strcpy( msgPtr, "Address not mapped to object.\n" ); + break; + case SEGV_ACCERR: + strcpy( msgPtr, "Invalid permissions for mapped object.\n" ); + break; +# ifdef SEGV_BNDERR + case SEGV_BNDERR: + strcpy( msgPtr, "Failed address bound checks.\n" ); + break; +# endif +# ifdef SEGV_PKUERR + case SEGV_PKUERR: + strcpy( msgPtr, "Access was denied by memory protection keys.\n" ); + break; +# endif + default: + break; + } + break; + case SIGPIPE: + strcpy( msgPtr, "Broken pipe.\n" ); + while( *msgPtr ) msgPtr++; + break; + case SIGBUS: + strcpy( msgPtr, "Bus error.\n" ); + while( *msgPtr ) msgPtr++; + switch( info->si_code ) + { + case BUS_ADRALN: + strcpy( msgPtr, "Invalid address alignment.\n" ); + break; + case BUS_ADRERR: + strcpy( msgPtr, "Nonexistent physical address.\n" ); + break; + case BUS_OBJERR: + strcpy( msgPtr, "Object-specific hardware error.\n" ); + break; + case BUS_MCEERR_AR: + strcpy( msgPtr, "Hardware memory error consumed on a machine check; action required.\n" ); + break; + case BUS_MCEERR_AO: + strcpy( msgPtr, "Hardware memory error detected in process but not consumed; action optional.\n" ); + break; + default: + break; + } + break; + default: + abort(); + } + while( *msgPtr ) msgPtr++; + + if( signal != SIGPIPE ) + { + strcpy( msgPtr, "Fault address: 0x" ); + while( *msgPtr ) msgPtr++; + HexPrint( msgPtr, uint64_t( info->si_addr ) ); + *msgPtr++ = '\n'; + } + + { + Magic magic; + auto token = GetToken(); + auto& tail = token->get_tail_index(); + auto item = token->enqueue_begin( magic ); + MemWrite( &item->hdr.type, QueueType::CrashReport ); + item->crashReport.time = Profiler::GetTime(); + item->crashReport.text = (uint64_t)s_crashText; + tail.store( magic + 1, std::memory_order_release ); + + GetProfiler().SendCallstack( 60, "__kernel_rt_sigreturn" ); + } + + DIR* dp = opendir( "/proc/self/task" ); + if( !dp ) abort(); + + const auto selfTid = syscall( SYS_gettid ); + + struct dirent* ep; + while( ( ep = readdir( dp ) ) != nullptr ) + { + if( ep->d_name[0] == '.' 
) continue; + int tid = atoi( ep->d_name ); + if( tid != selfTid && tid != s_profilerTid ) + { + syscall( SYS_tkill, tid, SIGPWR ); + } + } + closedir( dp ); + + { + Magic magic; + auto token = GetToken(); + auto& tail = token->get_tail_index(); + auto item = token->enqueue_begin( magic ); + MemWrite( &item->hdr.type, QueueType::Crash ); + tail.store( magic + 1, std::memory_order_release ); + } + + std::this_thread::sleep_for( std::chrono::milliseconds( 500 ) ); + GetProfiler().RequestShutdown(); + while( !GetProfiler().HasShutdownFinished() ) { std::this_thread::sleep_for( std::chrono::milliseconds( 10 ) ); }; + + abort(); +} +#endif + + +enum { QueuePrealloc = 256 * 1024 }; + +static Profiler* s_instance; +static Thread* s_thread; +static Thread* s_compressThread; + +#ifdef TRACY_HAS_SYSTEM_TRACING +static Thread* s_sysTraceThread = nullptr; +#endif + +#ifdef TRACY_DELAYED_INIT +struct ThreadNameData; +TRACY_API moodycamel::ConcurrentQueue<QueueItem>& GetQueue(); + +struct RPMallocInit { RPMallocInit() { rpmalloc_initialize(); } }; + +TRACY_API void InitRPMallocThread() +{ + rpmalloc_initialize(); + rpmalloc_thread_initialize(); +} + +struct ProfilerData +{ + int64_t initTime = SetupHwTimer(); + RPMallocInit rpmalloc_init; + moodycamel::ConcurrentQueue<QueueItem> queue; + Profiler profiler; + std::atomic<uint32_t> lockCounter { 0 }; + std::atomic<uint8_t> gpuCtxCounter { 0 }; + std::atomic<ThreadNameData*> threadNameData { nullptr }; +}; + +struct ProducerWrapper +{ + ProducerWrapper( ProfilerData& data ) : detail( data.queue ), ptr( data.queue.get_explicit_producer( detail ) ) {} + moodycamel::ProducerToken detail; + tracy::moodycamel::ConcurrentQueue<QueueItem>::ExplicitProducer* ptr; +}; + +struct ProfilerThreadData +{ + ProfilerThreadData( ProfilerData& data ) : token( data ), gpuCtx( { nullptr } ) {} + RPMallocInit rpmalloc_init; + ProducerWrapper token; + GpuCtxWrapper gpuCtx; +# ifdef TRACY_ON_DEMAND + LuaZoneState luaZoneState; +# endif +}; + +static std::atomic<int> profilerDataLock { 0 }; +static std::atomic<ProfilerData*> profilerData { nullptr }; + +static ProfilerData& GetProfilerData() +{ + auto ptr = profilerData.load( std::memory_order_acquire ); + if( !ptr ) + { + int expected = 0; + while( !profilerDataLock.compare_exchange_strong( expected, 1, std::memory_order_release, std::memory_order_relaxed ) ) { expected = 0; } + ptr = profilerData.load( std::memory_order_acquire ); + if( !ptr ) + { + ptr = (ProfilerData*)malloc( sizeof( ProfilerData ) ); + new (ptr) ProfilerData(); + profilerData.store( ptr, std::memory_order_release ); + } + profilerDataLock.store( 0, std::memory_order_release ); + } + return *ptr; +} + +static ProfilerThreadData& GetProfilerThreadData() +{ + thread_local ProfilerThreadData data( GetProfilerData() ); + return data; +} + +TRACY_API moodycamel::ConcurrentQueue<QueueItem>::ExplicitProducer* GetToken() { return GetProfilerThreadData().token.ptr; } +TRACY_API Profiler& GetProfiler() { return GetProfilerData().profiler; } +TRACY_API moodycamel::ConcurrentQueue<QueueItem>& GetQueue() { return GetProfilerData().queue; } +TRACY_API int64_t GetInitTime() { return GetProfilerData().initTime; } +TRACY_API std::atomic<uint32_t>& GetLockCounter() { return GetProfilerData().lockCounter; } +TRACY_API std::atomic<uint8_t>& GetGpuCtxCounter() { return GetProfilerData().gpuCtxCounter; } +TRACY_API GpuCtxWrapper& GetGpuCtx() { return GetProfilerThreadData().gpuCtx; } +TRACY_API uint64_t GetThreadHandle() { return detail::GetThreadHandleImpl(); } +TRACY_API 
std::atomic<ThreadNameData*>& GetThreadNameData() { return GetProfilerData().threadNameData; } + +# ifdef TRACY_ON_DEMAND +TRACY_API LuaZoneState& GetLuaZoneState() { return GetProfilerThreadData().luaZoneState; } +# endif + +#else +TRACY_API void InitRPMallocThread() +{ + rpmalloc_thread_initialize(); +} + +// MSVC static initialization order solution. gcc/clang uses init_order() to avoid all this. + +// 1a. But s_queue is needed for initialization of variables in point 2. +extern moodycamel::ConcurrentQueue<QueueItem> s_queue; + +thread_local RPMallocInit init_order(106) s_rpmalloc_thread_init; + +// 2. If these variables would be in the .CRT$XCB section, they would be initialized only in main thread. +thread_local moodycamel::ProducerToken init_order(107) s_token_detail( s_queue ); +thread_local ProducerWrapper init_order(108) s_token { s_queue.get_explicit_producer( s_token_detail ) }; +thread_local ThreadHandleWrapper init_order(104) s_threadHandle { detail::GetThreadHandleImpl() }; + +# ifdef _MSC_VER +// 1. Initialize these static variables before all other variables. +# pragma warning( disable : 4075 ) +# pragma init_seg( ".CRT$XCB" ) +# endif + +static InitTimeWrapper init_order(101) s_initTime { SetupHwTimer() }; +static RPMallocInit init_order(102) s_rpmalloc_init; +moodycamel::ConcurrentQueue<QueueItem> init_order(103) s_queue( QueuePrealloc ); +std::atomic<uint32_t> init_order(104) s_lockCounter( 0 ); +std::atomic<uint8_t> init_order(104) s_gpuCtxCounter( 0 ); + +thread_local GpuCtxWrapper init_order(104) s_gpuCtx { nullptr }; + +struct ThreadNameData; +static std::atomic<ThreadNameData*> init_order(104) s_threadNameDataInstance( nullptr ); +std::atomic<ThreadNameData*>& s_threadNameData = s_threadNameDataInstance; + +# ifdef TRACY_ON_DEMAND +thread_local LuaZoneState init_order(104) s_luaZoneState { 0, false }; +# endif + +static Profiler init_order(105) s_profiler; + +TRACY_API moodycamel::ConcurrentQueue<QueueItem>::ExplicitProducer* GetToken() { return s_token.ptr; } +TRACY_API Profiler& GetProfiler() { return s_profiler; } +TRACY_API moodycamel::ConcurrentQueue<QueueItem>& GetQueue() { return s_queue; } +TRACY_API int64_t GetInitTime() { return s_initTime.val; } +TRACY_API std::atomic<uint32_t>& GetLockCounter() { return s_lockCounter; } +TRACY_API std::atomic<uint8_t>& GetGpuCtxCounter() { return s_gpuCtxCounter; } +TRACY_API GpuCtxWrapper& GetGpuCtx() { return s_gpuCtx; } +# ifdef __CYGWIN__ +// Hackfix for cygwin reporting memory frees without matching allocations. WTF? 
+TRACY_API uint64_t GetThreadHandle() { return detail::GetThreadHandleImpl(); } +# else +TRACY_API uint64_t GetThreadHandle() { return s_threadHandle.val; } +# endif + +TRACY_API std::atomic<ThreadNameData*>& GetThreadNameData() { return s_threadNameData; } + +# ifdef TRACY_ON_DEMAND +TRACY_API LuaZoneState& GetLuaZoneState() { return s_luaZoneState; } +# endif +#endif + +enum { BulkSize = TargetFrameSize / QueueItemSize }; + +Profiler::Profiler() + : m_timeBegin( 0 ) + , m_mainThread( detail::GetThreadHandleImpl() ) + , m_epoch( std::chrono::duration_cast<std::chrono::seconds>( std::chrono::system_clock::now().time_since_epoch() ).count() ) + , m_shutdown( false ) + , m_shutdownManual( false ) + , m_shutdownFinished( false ) + , m_sock( nullptr ) + , m_broadcast( nullptr ) + , m_noExit( false ) + , m_zoneId( 1 ) + , m_stream( LZ4_createStream() ) + , m_buffer( (char*)tracy_malloc( TargetFrameSize*3 ) ) + , m_bufferOffset( 0 ) + , m_bufferStart( 0 ) + , m_itemBuf( (QueueItem*)tracy_malloc( sizeof( QueueItem ) * BulkSize ) ) + , m_lz4Buf( (char*)tracy_malloc( LZ4Size + sizeof( lz4sz_t ) ) ) + , m_serialQueue( 1024*1024 ) + , m_serialDequeue( 1024*1024 ) + , m_fiQueue( 16 ) + , m_fiDequeue( 16 ) + , m_frameCount( 0 ) +#ifdef TRACY_ON_DEMAND + , m_isConnected( false ) + , m_connectionId( 0 ) + , m_deferredQueue( 64*1024 ) +#endif + , m_paramCallback( nullptr ) +{ + assert( !s_instance ); + s_instance = this; + +#ifndef TRACY_DELAYED_INIT +# ifdef _MSC_VER + // 3. But these variables need to be initialized in main thread within the .CRT$XCB section. Do it here. + s_token_detail = moodycamel::ProducerToken( s_queue ); + s_token = ProducerWrapper { s_queue.get_explicit_producer( s_token_detail ) }; + s_threadHandle = ThreadHandleWrapper { m_mainThread }; +# endif +#endif + + CalibrateTimer(); + CalibrateDelay(); + +#ifndef TRACY_NO_EXIT + const char* noExitEnv = getenv( "TRACY_NO_EXIT" ); + if( noExitEnv && noExitEnv[0] == '1' ) + { + m_noExit = true; + } +#endif + + s_thread = (Thread*)tracy_malloc( sizeof( Thread ) ); + new(s_thread) Thread( LaunchWorker, this ); + + s_compressThread = (Thread*)tracy_malloc( sizeof( Thread ) ); + new(s_compressThread) Thread( LaunchCompressWorker, this ); + +#ifdef TRACY_HAS_SYSTEM_TRACING + if( SysTraceStart() ) + { + s_sysTraceThread = (Thread*)tracy_malloc( sizeof( Thread ) ); + new(s_sysTraceThread) Thread( SysTraceWorker, nullptr ); + } +#endif + +#if defined _WIN32 || defined __CYGWIN__ + s_profilerThreadId = GetThreadId( s_thread->Handle() ); + AddVectoredExceptionHandler( 1, CrashFilter ); +#endif + +#ifdef __linux__ + struct sigaction threadFreezer = {}; + threadFreezer.sa_handler = ThreadFreezer; + sigaction( SIGPWR, &threadFreezer, nullptr ); + + struct sigaction crashHandler = {}; + crashHandler.sa_sigaction = CrashHandler; + crashHandler.sa_flags = SA_SIGINFO; + sigaction( SIGILL, &crashHandler, nullptr ); + sigaction( SIGFPE, &crashHandler, nullptr ); + sigaction( SIGSEGV, &crashHandler, nullptr ); + sigaction( SIGPIPE, &crashHandler, nullptr ); + sigaction( SIGBUS, &crashHandler, nullptr ); +#endif + +#ifdef TRACY_HAS_CALLSTACK + InitCallstack(); +#endif + + m_timeBegin.store( GetTime(), std::memory_order_relaxed ); +} + +Profiler::~Profiler() +{ + m_shutdown.store( true, std::memory_order_relaxed ); + +#ifdef TRACY_HAS_SYSTEM_TRACING + if( s_sysTraceThread ) + { + SysTraceStop(); + s_sysTraceThread->~Thread(); + tracy_free( s_sysTraceThread ); + } +#endif + + s_compressThread->~Thread(); + tracy_free( s_compressThread ); + s_thread->~Thread(); 
+ tracy_free( s_thread ); + + tracy_free( m_lz4Buf ); + tracy_free( m_itemBuf ); + tracy_free( m_buffer ); + LZ4_freeStream( (LZ4_stream_t*)m_stream ); + + if( m_sock ) + { + m_sock->~Socket(); + tracy_free( m_sock ); + } + + if( m_broadcast ) + { + m_broadcast->~UdpBroadcast(); + tracy_free( m_broadcast ); + } + + assert( s_instance ); + s_instance = nullptr; +} + +bool Profiler::ShouldExit() +{ + return s_instance->m_shutdown.load( std::memory_order_relaxed ); +} + +void Profiler::Worker() +{ +#ifdef __linux__ + s_profilerTid = syscall( SYS_gettid ); +#endif + + SetThreadName( "Tracy Profiler" ); + +#ifdef TRACY_PORT + const auto port = TRACY_PORT; +#else + const auto port = 8086; +#endif + + while( m_timeBegin.load( std::memory_order_relaxed ) == 0 ) std::this_thread::sleep_for( std::chrono::milliseconds( 10 ) ); + + rpmalloc_thread_initialize(); + + const auto procname = GetProcessName(); + const auto pnsz = std::min<size_t>( strlen( procname ), WelcomeMessageProgramNameSize - 1 ); + + const auto hostinfo = GetHostInfo(); + const auto hisz = std::min<size_t>( strlen( hostinfo ), WelcomeMessageHostInfoSize - 1 ); + + const uint64_t pid = GetPid(); + +#ifdef TRACY_ON_DEMAND + uint8_t onDemand = 1; +#else + uint8_t onDemand = 0; +#endif + +#ifdef __APPLE__ + uint8_t isApple = 1; +#else + uint8_t isApple = 0; +#endif + + WelcomeMessage welcome; + MemWrite( &welcome.timerMul, m_timerMul ); + MemWrite( &welcome.initBegin, GetInitTime() ); + MemWrite( &welcome.initEnd, m_timeBegin.load( std::memory_order_relaxed ) ); + MemWrite( &welcome.delay, m_delay ); + MemWrite( &welcome.resolution, m_resolution ); + MemWrite( &welcome.epoch, m_epoch ); + MemWrite( &welcome.pid, pid ); + MemWrite( &welcome.onDemand, onDemand ); + MemWrite( &welcome.isApple, isApple ); + memcpy( welcome.programName, procname, pnsz ); + memset( welcome.programName + pnsz, 0, WelcomeMessageProgramNameSize - pnsz ); + memcpy( welcome.hostInfo, hostinfo, hisz ); + memset( welcome.hostInfo + hisz, 0, WelcomeMessageHostInfoSize - hisz ); + + moodycamel::ConsumerToken token( GetQueue() ); + + ListenSocket listen; + if( !listen.Listen( port, 8 ) ) + { + for(;;) + { + if( ShouldExit() ) + { + m_shutdownFinished.store( true, std::memory_order_relaxed ); + return; + } + + ClearQueues( token ); + } + } + +#ifndef TRACY_NO_BROADCAST + m_broadcast = (UdpBroadcast*)tracy_malloc( sizeof( UdpBroadcast ) ); + new(m_broadcast) UdpBroadcast(); + if( !m_broadcast->Open( "255.255.255.255", port ) ) + { + m_broadcast->~UdpBroadcast(); + tracy_free( m_broadcast ); + m_broadcast = nullptr; + } +#endif + + int broadcastLen = 0; + auto& broadcastMsg = GetBroadcastMessage( procname, pnsz, broadcastLen ); + uint64_t lastBroadcast = 0; + + // Connections loop. + // Each iteration of the loop handles whole connection. Multiple iterations will only + // happen in the on-demand mode or when handshake fails. 
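+    // Rough outline of the protocol this loop implements, as read from the code
+    // below (descriptive annotation, not upstream text): accept a TCP connection
+    // on `port` (TRACY_PORT, or 8086 by default), broadcasting a BroadcastMessage
+    // over UDP roughly every 3 seconds while waiting; verify the
+    // HandshakeShibboleth bytes and the server's ProtocolVersion, answering
+    // HandshakeProtocolMismatch and dropping the socket on mismatch; otherwise
+    // answer HandshakeWelcome, reset the LZ4 stream, send the WelcomeMessage,
+    // and stream LZ4-compressed queue data until the connection is lost or the
+    // client shuts down.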
+ for(;;) + { + // Wait for incoming connection + for(;;) + { +#ifndef TRACY_NO_EXIT + if( !m_noExit && ShouldExit() ) + { + m_shutdownFinished.store( true, std::memory_order_relaxed ); + return; + } +#endif + m_sock = listen.Accept(); + if( m_sock ) break; +#ifndef TRACY_ON_DEMAND + ProcessSysTime(); +#endif + + if( m_broadcast ) + { + const auto t = std::chrono::high_resolution_clock::now().time_since_epoch().count(); + if( t - lastBroadcast > 3000000000 ) // 3s + { + lastBroadcast = t; + const auto ts = std::chrono::duration_cast<std::chrono::seconds>( std::chrono::system_clock::now().time_since_epoch() ).count(); + broadcastMsg.activeTime = uint32_t( ts - m_epoch ); + m_broadcast->Send( port, &broadcastMsg, broadcastLen ); + } + } + } + + // Handshake + { + char shibboleth[HandshakeShibbolethSize]; + auto res = m_sock->ReadRaw( shibboleth, HandshakeShibbolethSize, 2000 ); + if( !res || memcmp( shibboleth, HandshakeShibboleth, HandshakeShibbolethSize ) != 0 ) + { + m_sock->~Socket(); + tracy_free( m_sock ); + m_sock = nullptr; + continue; + } + + uint32_t protocolVersion; + res = m_sock->ReadRaw( &protocolVersion, sizeof( protocolVersion ), 2000 ); + if( !res ) + { + m_sock->~Socket(); + tracy_free( m_sock ); + m_sock = nullptr; + continue; + } + + if( protocolVersion != ProtocolVersion ) + { + HandshakeStatus status = HandshakeProtocolMismatch; + m_sock->Send( &status, sizeof( status ) ); + m_sock->~Socket(); + tracy_free( m_sock ); + m_sock = nullptr; + continue; + } + } + +#ifdef TRACY_ON_DEMAND + const auto currentTime = GetTime(); + ClearQueues( token ); + m_connectionId.fetch_add( 1, std::memory_order_release ); + m_isConnected.store( true, std::memory_order_release ); +#endif + + HandshakeStatus handshake = HandshakeWelcome; + m_sock->Send( &handshake, sizeof( handshake ) ); + + LZ4_resetStream( (LZ4_stream_t*)m_stream ); + m_sock->Send( &welcome, sizeof( welcome ) ); + + m_threadCtx = 0; + m_refTimeSerial = 0; + m_refTimeCtx = 0; + m_refTimeGpu = 0; + +#ifdef TRACY_ON_DEMAND + OnDemandPayloadMessage onDemand; + onDemand.frames = m_frameCount.load( std::memory_order_relaxed ); + onDemand.currentTime = currentTime; + + m_sock->Send( &onDemand, sizeof( onDemand ) ); + + m_deferredLock.lock(); + for( auto& item : m_deferredQueue ) + { + const auto idx = MemRead<uint8_t>( &item.hdr.idx ); + if( (QueueType)idx == QueueType::MessageAppInfo ) + { + uint64_t ptr = MemRead<uint64_t>( &item.message.text ); + SendString( ptr, (const char*)ptr, QueueType::CustomStringData ); + } + AppendData( &item, QueueDataSize[idx] ); + } + m_deferredLock.unlock(); +#endif + + // Main communications loop + int keepAlive = 0; + for(;;) + { + ProcessSysTime(); + const auto status = Dequeue( token ); + const auto serialStatus = DequeueSerial(); + if( status == DequeueStatus::ConnectionLost || serialStatus == DequeueStatus::ConnectionLost ) + { + break; + } + else if( status == DequeueStatus::QueueEmpty && serialStatus == DequeueStatus::QueueEmpty ) + { + if( ShouldExit() ) break; + if( m_bufferOffset != m_bufferStart ) + { + if( !CommitData() ) break; + } + if( keepAlive == 500 ) + { + QueueItem ka; + ka.hdr.type = QueueType::KeepAlive; + AppendData( &ka, QueueDataSize[ka.hdr.idx] ); + if( !CommitData() ) break; + + keepAlive = 0; + } + else + { + keepAlive++; + std::this_thread::sleep_for( std::chrono::milliseconds( 10 ) ); + } + } + else + { + keepAlive = 0; + } + + bool connActive = true; + while( m_sock->HasData() && connActive ) + { + connActive = HandleServerQuery(); + } + if( !connActive ) break; + } 
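+        // Note on the idle path of the communications loop above (descriptive
+        // annotation): when both Dequeue() and DequeueSerial() report empty
+        // queues, the worker sleeps 10 ms per iteration and emits a KeepAlive
+        // item once the counter reaches 500, i.e. roughly every 5 seconds of
+        // inactivity, presumably so the server side can tell an idle client
+        // from a dead one.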
+ if( ShouldExit() ) break; + +#ifdef TRACY_ON_DEMAND + m_isConnected.store( false, std::memory_order_release ); + m_bufferOffset = 0; + m_bufferStart = 0; +#endif + + m_sock->~Socket(); + tracy_free( m_sock ); + m_sock = nullptr; + +#ifndef TRACY_ON_DEMAND + // Client is no longer available here. Accept incoming connections, but reject handshake. + for(;;) + { + if( ShouldExit() ) + { + m_shutdownFinished.store( true, std::memory_order_relaxed ); + return; + } + + ClearQueues( token ); + + m_sock = listen.Accept(); + if( m_sock ) + { + char shibboleth[HandshakeShibbolethSize]; + auto res = m_sock->ReadRaw( shibboleth, HandshakeShibbolethSize, 1000 ); + if( !res || memcmp( shibboleth, HandshakeShibboleth, HandshakeShibbolethSize ) != 0 ) + { + m_sock->~Socket(); + tracy_free( m_sock ); + m_sock = nullptr; + continue; + } + + uint32_t protocolVersion; + res = m_sock->ReadRaw( &protocolVersion, sizeof( protocolVersion ), 1000 ); + if( !res ) + { + m_sock->~Socket(); + tracy_free( m_sock ); + m_sock = nullptr; + continue; + } + + HandshakeStatus status = HandshakeNotAvailable; + m_sock->Send( &status, sizeof( status ) ); + m_sock->~Socket(); + tracy_free( m_sock ); + } + } +#endif + } + // End of connections loop + + // Client is exiting. Send items remaining in queues. + for(;;) + { + const auto status = Dequeue( token ); + const auto serialStatus = DequeueSerial(); + if( status == DequeueStatus::ConnectionLost || serialStatus == DequeueStatus::ConnectionLost ) + { + m_shutdownFinished.store( true, std::memory_order_relaxed ); + return; + } + else if( status == DequeueStatus::QueueEmpty && serialStatus == DequeueStatus::QueueEmpty ) + { + if( m_bufferOffset != m_bufferStart ) CommitData(); + break; + } + + while( m_sock->HasData() ) + { + if( !HandleServerQuery() ) + { + m_shutdownFinished.store( true, std::memory_order_relaxed ); + return; + } + } + } + + // Send client termination notice to the server + QueueItem terminate; + MemWrite( &terminate.hdr.type, QueueType::Terminate ); + if( !SendData( (const char*)&terminate, 1 ) ) + { + m_shutdownFinished.store( true, std::memory_order_relaxed ); + return; + } + // Handle remaining server queries + { // XXX diesel changes + if( m_bufferOffset != m_bufferStart ) CommitData(); + std::this_thread::sleep_for( std::chrono::milliseconds( 10 ) ); + } + for(;;) + { + if( m_sock->HasData() ) + { + while( m_sock->HasData() ) + { + if( !HandleServerQuery() ) + { + m_shutdownFinished.store( true, std::memory_order_relaxed ); + return; + } + } + while( Dequeue( token ) == DequeueStatus::DataDequeued ) {} + while( DequeueSerial() == DequeueStatus::DataDequeued ) {} + if( m_bufferOffset != m_bufferStart ) + { + if( !CommitData() ) + { + m_shutdownFinished.store( true, std::memory_order_relaxed ); + return; + } + } + } + else + { + // XXX diesel changes + // if( m_bufferOffset != m_bufferStart ) CommitData(); + // std::this_thread::sleep_for( std::chrono::milliseconds( 10 ) ); + m_shutdownFinished.store( true, std::memory_order_relaxed ); + return; + } + } +} + +void Profiler::CompressWorker() +{ + SetThreadName( "Tracy DXT1" ); + while( m_timeBegin.load( std::memory_order_relaxed ) == 0 ) std::this_thread::sleep_for( std::chrono::milliseconds( 10 ) ); + rpmalloc_thread_initialize(); + for(;;) + { + const auto shouldExit = ShouldExit(); + + { + bool lockHeld = true; + while( !m_fiLock.try_lock() ) + { + if( m_shutdownManual.load( std::memory_order_relaxed ) ) + { + lockHeld = false; + break; + } + } + if( !m_fiQueue.empty() ) m_fiQueue.swap( m_fiDequeue ); + 
if( lockHeld ) + { + m_fiLock.unlock(); + } + } + + const auto sz = m_fiDequeue.size(); + if( sz > 0 ) + { + auto fi = m_fiDequeue.data(); + auto end = fi + sz; + while( fi != end ) + { + const auto w = fi->w; + const auto h = fi->h; + const auto csz = size_t( w * h / 2 ); + auto etc1buf = (char*)tracy_malloc( csz ); + CompressImageDxt1( (const char*)fi->image, etc1buf, w, h ); + tracy_free( fi->image ); + + Magic magic; + auto token = GetToken(); + auto& tail = token->get_tail_index(); + auto item = token->enqueue_begin( magic ); + MemWrite( &item->hdr.type, QueueType::FrameImage ); + MemWrite( &item->frameImage.image, (uint64_t)etc1buf ); + MemWrite( &item->frameImage.frame, fi->frame ); + MemWrite( &item->frameImage.w, w ); + MemWrite( &item->frameImage.h, h ); + uint8_t flip = fi->flip; + MemWrite( &item->frameImage.flip, flip ); + tail.store( magic + 1, std::memory_order_release ); + + fi++; + } + m_fiDequeue.clear(); + } + else + { + std::this_thread::sleep_for( std::chrono::milliseconds( 20 ) ); + } + + if( shouldExit ) + { + return; + } + } +} + +static void FreeAssociatedMemory( const QueueItem& item ) +{ + if( item.hdr.idx >= (int)QueueType::Terminate ) return; + + uint64_t ptr; + switch( item.hdr.type ) + { + case QueueType::ZoneText: + case QueueType::ZoneName: + ptr = MemRead<uint64_t>( &item.zoneText.text ); + tracy_free( (void*)ptr ); + break; + case QueueType::Message: + case QueueType::MessageColor: + case QueueType::MessageCallstack: + case QueueType::MessageColorCallstack: +#ifndef TRACY_ON_DEMAND + case QueueType::MessageAppInfo: +#endif + ptr = MemRead<uint64_t>( &item.message.text ); + tracy_free( (void*)ptr ); + break; + case QueueType::ZoneBeginAllocSrcLoc: + case QueueType::ZoneBeginAllocSrcLocCallstack: + ptr = MemRead<uint64_t>( &item.zoneBegin.srcloc ); + tracy_free( (void*)ptr ); + break; + case QueueType::CallstackMemory: + ptr = MemRead<uint64_t>( &item.callstackMemory.ptr ); + tracy_free( (void*)ptr ); + break; + case QueueType::Callstack: + ptr = MemRead<uint64_t>( &item.callstack.ptr ); + tracy_free( (void*)ptr ); + break; + case QueueType::CallstackAlloc: + ptr = MemRead<uint64_t>( &item.callstackAlloc.nativePtr ); + tracy_free( (void*)ptr ); + ptr = MemRead<uint64_t>( &item.callstackAlloc.ptr ); + tracy_free( (void*)ptr ); + break; + case QueueType::FrameImage: + ptr = MemRead<uint64_t>( &item.frameImage.image ); + tracy_free( (void*)ptr ); + break; +#ifdef TRACY_ON_DEMAND + case QueueType::MessageAppInfo: + // Don't free memory associated with deferred messages. 
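+        // (the text pointer is still referenced from m_deferredQueue, which
+        // re-sends MessageAppInfo entries to every new connection, so it must
+        // stay alive)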
+ break; +#endif + default: + break; + } +} + +void Profiler::ClearQueues( moodycamel::ConsumerToken& token ) +{ + for(;;) + { + const auto sz = GetQueue().try_dequeue_bulk( token, m_itemBuf, BulkSize ); + if( sz == 0 ) break; + for( size_t i=0; i<sz; i++ ) FreeAssociatedMemory( m_itemBuf[i] ); + } + + ClearSerial(); +} + +void Profiler::ClearSerial() +{ + bool lockHeld = true; + while( !m_serialLock.try_lock() ) + { + if( m_shutdownManual.load( std::memory_order_relaxed ) ) + { + lockHeld = false; + break; + } + } + for( auto& v : m_serialQueue ) FreeAssociatedMemory( v ); + m_serialQueue.clear(); + if( lockHeld ) + { + m_serialLock.unlock(); + } + + for( auto& v : m_serialDequeue ) FreeAssociatedMemory( v ); + m_serialDequeue.clear(); +} + +Profiler::DequeueStatus Profiler::Dequeue( moodycamel::ConsumerToken& token ) +{ + uint64_t threadId; + const auto sz = GetQueue().try_dequeue_bulk_single( token, m_itemBuf, BulkSize, threadId ); + if( sz > 0 ) + { + if( threadId != m_threadCtx ) + { + QueueItem item; + MemWrite( &item.hdr.type, QueueType::ThreadContext ); + MemWrite( &item.threadCtx.thread, threadId ); + if( !AppendData( &item, QueueDataSize[(int)QueueType::ThreadContext] ) ) return DequeueStatus::ConnectionLost; + m_threadCtx = threadId; + m_refTimeThread = 0; + } + + auto end = m_itemBuf + sz; + auto item = m_itemBuf; + while( item != end ) + { + uint64_t ptr; + const auto idx = MemRead<uint8_t>( &item->hdr.idx ); + if( idx < (int)QueueType::Terminate ) + { + switch( (QueueType)idx ) + { + case QueueType::ZoneText: + case QueueType::ZoneName: + ptr = MemRead<uint64_t>( &item->zoneText.text ); + SendString( ptr, (const char*)ptr, QueueType::CustomStringData ); + tracy_free( (void*)ptr ); + break; + case QueueType::Message: + case QueueType::MessageColor: + case QueueType::MessageCallstack: + case QueueType::MessageColorCallstack: + ptr = MemRead<uint64_t>( &item->message.text ); + SendString( ptr, (const char*)ptr, QueueType::CustomStringData ); + tracy_free( (void*)ptr ); + break; + case QueueType::MessageAppInfo: + ptr = MemRead<uint64_t>( &item->message.text ); + SendString( ptr, (const char*)ptr, QueueType::CustomStringData ); +#ifndef TRACY_ON_DEMAND + tracy_free( (void*)ptr ); +#endif + break; + case QueueType::ZoneBeginAllocSrcLoc: + case QueueType::ZoneBeginAllocSrcLocCallstack: + { + int64_t t = MemRead<int64_t>( &item->zoneBegin.time ); + int64_t dt = t - m_refTimeThread; + m_refTimeThread = t; + MemWrite( &item->zoneBegin.time, dt ); + ptr = MemRead<uint64_t>( &item->zoneBegin.srcloc ); + SendSourceLocationPayload( ptr ); + tracy_free( (void*)ptr ); + break; + } + case QueueType::Callstack: + ptr = MemRead<uint64_t>( &item->callstack.ptr ); + SendCallstackPayload( ptr ); + tracy_free( (void*)ptr ); + break; + case QueueType::CallstackAlloc: + ptr = MemRead<uint64_t>( &item->callstackAlloc.nativePtr ); + if( ptr != 0 ) + { + CutCallstack( (void*)ptr, "lua_pcall" ); + SendCallstackPayload( ptr ); + tracy_free( (void*)ptr ); + } + ptr = MemRead<uint64_t>( &item->callstackAlloc.ptr ); + SendCallstackAlloc( ptr ); + tracy_free( (void*)ptr ); + break; + case QueueType::FrameImage: + { + ptr = MemRead<uint64_t>( &item->frameImage.image ); + const auto w = MemRead<uint16_t>( &item->frameImage.w ); + const auto h = MemRead<uint16_t>( &item->frameImage.h ); + const auto csz = size_t( w * h / 2 ); + SendLongString( ptr, (const char*)ptr, csz, QueueType::FrameImageData ); + tracy_free( (void*)ptr ); + break; + } + case QueueType::ZoneBegin: + case QueueType::ZoneBeginCallstack: + { + 
int64_t t = MemRead<int64_t>( &item->zoneBegin.time ); + int64_t dt = t - m_refTimeThread; + m_refTimeThread = t; + MemWrite( &item->zoneBegin.time, dt ); + break; + } + case QueueType::ZoneEnd: + { + int64_t t = MemRead<int64_t>( &item->zoneEnd.time ); + int64_t dt = t - m_refTimeThread; + m_refTimeThread = t; + MemWrite( &item->zoneEnd.time, dt ); + break; + } + case QueueType::GpuZoneBegin: + case QueueType::GpuZoneBeginCallstack: + { + int64_t t = MemRead<int64_t>( &item->gpuZoneBegin.cpuTime ); + int64_t dt = t - m_refTimeThread; + m_refTimeThread = t; + MemWrite( &item->gpuZoneBegin.cpuTime, dt ); + break; + } + case QueueType::GpuZoneEnd: + { + int64_t t = MemRead<int64_t>( &item->gpuZoneEnd.cpuTime ); + int64_t dt = t - m_refTimeThread; + m_refTimeThread = t; + MemWrite( &item->gpuZoneEnd.cpuTime, dt ); + break; + } + case QueueType::PlotData: + { + int64_t t = MemRead<int64_t>( &item->plotData.time ); + int64_t dt = t - m_refTimeThread; + m_refTimeThread = t; + MemWrite( &item->plotData.time, dt ); + break; + } + case QueueType::ContextSwitch: + { + int64_t t = MemRead<int64_t>( &item->contextSwitch.time ); + int64_t dt = t - m_refTimeCtx; + m_refTimeCtx = t; + MemWrite( &item->contextSwitch.time, dt ); + break; + } + case QueueType::ThreadWakeup: + { + int64_t t = MemRead<int64_t>( &item->threadWakeup.time ); + int64_t dt = t - m_refTimeCtx; + m_refTimeCtx = t; + MemWrite( &item->threadWakeup.time, dt ); + break; + } + case QueueType::GpuTime: + { + int64_t t = MemRead<int64_t>( &item->gpuTime.gpuTime ); + int64_t dt = t - m_refTimeGpu; + m_refTimeGpu = t; + MemWrite( &item->gpuTime.gpuTime, dt ); + break; + } + default: + assert( false ); + break; + } + } + if( !AppendData( item, QueueDataSize[idx] ) ) return DequeueStatus::ConnectionLost; + item++; + } + } + else + { + return DequeueStatus::QueueEmpty; + } + return DequeueStatus::DataDequeued; +} + +Profiler::DequeueStatus Profiler::DequeueContextSwitches( tracy::moodycamel::ConsumerToken& token, int64_t& timeStop ) +{ + const auto sz = GetQueue().try_dequeue_bulk( token, m_itemBuf, BulkSize ); + if( sz > 0 ) + { + auto end = m_itemBuf + sz; + auto item = m_itemBuf; + while( item != end ) + { + FreeAssociatedMemory( *item ); + const auto idx = MemRead<uint8_t>( &item->hdr.idx ); + if( idx == (uint8_t)QueueType::ContextSwitch ) + { + const auto csTime = MemRead<int64_t>( &item->contextSwitch.time ); + if( csTime > timeStop ) + { + timeStop = -1; + return DequeueStatus::DataDequeued; + } + int64_t dt = csTime - m_refTimeCtx; + m_refTimeCtx = csTime; + MemWrite( &item->contextSwitch.time, dt ); + if( !AppendData( item, QueueDataSize[(int)QueueType::ContextSwitch] ) ) return DequeueStatus::ConnectionLost; + } + else if( idx == (uint8_t)QueueType::ThreadWakeup ) + { + const auto csTime = MemRead<int64_t>( &item->threadWakeup.time ); + if( csTime > timeStop ) + { + timeStop = -1; + return DequeueStatus::DataDequeued; + } + int64_t dt = csTime - m_refTimeCtx; + m_refTimeCtx = csTime; + MemWrite( &item->threadWakeup.time, dt ); + if( !AppendData( item, QueueDataSize[(int)QueueType::ThreadWakeup] ) ) return DequeueStatus::ConnectionLost; + } + item++; + } + } + else + { + return DequeueStatus::QueueEmpty; + } + return DequeueStatus::DataDequeued; +} + +Profiler::DequeueStatus Profiler::DequeueSerial() +{ + { + bool lockHeld = true; + while( !m_serialLock.try_lock() ) + { + if( m_shutdownManual.load( std::memory_order_relaxed ) ) + { + lockHeld = false; + break; + } + } + if( !m_serialQueue.empty() ) m_serialQueue.swap( m_serialDequeue 
); + if( lockHeld ) + { + m_serialLock.unlock(); + } + } + + const auto sz = m_serialDequeue.size(); + if( sz > 0 ) + { + auto item = m_serialDequeue.data(); + auto end = item + sz; + while( item != end ) + { + uint64_t ptr; + const auto idx = MemRead<uint8_t>( &item->hdr.idx ); + if( idx < (int)QueueType::Terminate ) + { + switch( (QueueType)idx ) + { + case QueueType::CallstackMemory: + ptr = MemRead<uint64_t>( &item->callstackMemory.ptr ); + SendCallstackPayload( ptr ); + tracy_free( (void*)ptr ); + break; + case QueueType::LockWait: + case QueueType::LockSharedWait: + { + int64_t t = MemRead<int64_t>( &item->lockWait.time ); + int64_t dt = t - m_refTimeSerial; + m_refTimeSerial = t; + MemWrite( &item->lockWait.time, dt ); + break; + } + case QueueType::LockObtain: + case QueueType::LockSharedObtain: + { + int64_t t = MemRead<int64_t>( &item->lockObtain.time ); + int64_t dt = t - m_refTimeSerial; + m_refTimeSerial = t; + MemWrite( &item->lockObtain.time, dt ); + break; + } + case QueueType::LockRelease: + case QueueType::LockSharedRelease: + { + int64_t t = MemRead<int64_t>( &item->lockRelease.time ); + int64_t dt = t - m_refTimeSerial; + m_refTimeSerial = t; + MemWrite( &item->lockRelease.time, dt ); + break; + } + case QueueType::MemAlloc: + case QueueType::MemAllocCallstack: + { + int64_t t = MemRead<int64_t>( &item->memAlloc.time ); + int64_t dt = t - m_refTimeSerial; + m_refTimeSerial = t; + MemWrite( &item->memAlloc.time, dt ); + break; + } + case QueueType::MemFree: + case QueueType::MemFreeCallstack: + { + int64_t t = MemRead<int64_t>( &item->memFree.time ); + int64_t dt = t - m_refTimeSerial; + m_refTimeSerial = t; + MemWrite( &item->memFree.time, dt ); + break; + } + case QueueType::GpuZoneBeginSerial: + case QueueType::GpuZoneBeginCallstackSerial: + { + int64_t t = MemRead<int64_t>( &item->gpuZoneBegin.cpuTime ); + int64_t dt = t - m_refTimeSerial; + m_refTimeSerial = t; + MemWrite( &item->gpuZoneBegin.cpuTime, dt ); + break; + } + case QueueType::GpuZoneEndSerial: + { + int64_t t = MemRead<int64_t>( &item->gpuZoneEnd.cpuTime ); + int64_t dt = t - m_refTimeSerial; + m_refTimeSerial = t; + MemWrite( &item->gpuZoneEnd.cpuTime, dt ); + break; + } + case QueueType::GpuTime: + { + int64_t t = MemRead<int64_t>( &item->gpuTime.gpuTime ); + int64_t dt = t - m_refTimeGpu; + m_refTimeGpu = t; + MemWrite( &item->gpuTime.gpuTime, dt ); + break; + } + default: + assert( false ); + break; + } + } + if( !AppendData( item, QueueDataSize[idx] ) ) return DequeueStatus::ConnectionLost; + item++; + } + m_serialDequeue.clear(); + } + else + { + return DequeueStatus::QueueEmpty; + } + return DequeueStatus::DataDequeued; +} + +bool Profiler::AppendData( const void* data, size_t len ) +{ + const auto ret = NeedDataSize( len ); + AppendDataUnsafe( data, len ); + return ret; +} + +bool Profiler::CommitData() +{ + bool ret = SendData( m_buffer + m_bufferStart, m_bufferOffset - m_bufferStart ); + if( m_bufferOffset > TargetFrameSize * 2 ) m_bufferOffset = 0; + m_bufferStart = m_bufferOffset; + return ret; +} + +bool Profiler::NeedDataSize( size_t len ) +{ + assert( len <= TargetFrameSize ); + bool ret = true; + if( m_bufferOffset - m_bufferStart + len > TargetFrameSize ) + { + ret = CommitData(); + } + return ret; +} + +bool Profiler::SendData( const char* data, size_t len ) +{ + const lz4sz_t lz4sz = LZ4_compress_fast_continue( (LZ4_stream_t*)m_stream, data, m_lz4Buf + sizeof( lz4sz_t ), (int)len, LZ4Size, 1 ); + memcpy( m_lz4Buf, &lz4sz, sizeof( lz4sz ) ); + return m_sock->Send( m_lz4Buf, lz4sz + 
sizeof( lz4sz_t ) ) != -1; +} + +void Profiler::SendString( uint64_t str, const char* ptr, QueueType type ) +{ + assert( type == QueueType::StringData || + type == QueueType::ThreadName || + type == QueueType::CustomStringData || + type == QueueType::PlotName || + type == QueueType::FrameName || + type == QueueType::ExternalName || + type == QueueType::ExternalThreadName ); + + QueueItem item; + MemWrite( &item.hdr.type, type ); + MemWrite( &item.stringTransfer.ptr, str ); + + auto len = strlen( ptr ); + assert( len <= std::numeric_limits<uint16_t>::max() ); + auto l16 = uint16_t( len ); + + NeedDataSize( QueueDataSize[(int)type] + sizeof( l16 ) + l16 ); + + AppendDataUnsafe( &item, QueueDataSize[(int)type] ); + AppendDataUnsafe( &l16, sizeof( l16 ) ); + AppendDataUnsafe( ptr, l16 ); +} + +void Profiler::SendLongString( uint64_t str, const char* ptr, size_t len, QueueType type ) +{ + assert( type == QueueType::FrameImageData ); + + QueueItem item; + MemWrite( &item.hdr.type, type ); + MemWrite( &item.stringTransfer.ptr, str ); + + assert( len <= std::numeric_limits<uint32_t>::max() ); + assert( QueueDataSize[(int)type] + sizeof( uint32_t ) + len <= TargetFrameSize ); + auto l32 = uint32_t( len ); + + NeedDataSize( QueueDataSize[(int)type] + sizeof( l32 ) + l32 ); + + AppendDataUnsafe( &item, QueueDataSize[(int)type] ); + AppendDataUnsafe( &l32, sizeof( l32 ) ); + AppendDataUnsafe( ptr, l32 ); +} + +void Profiler::SendSourceLocation( uint64_t ptr ) +{ + auto srcloc = (const SourceLocationData*)ptr; + QueueItem item; + MemWrite( &item.hdr.type, QueueType::SourceLocation ); + MemWrite( &item.srcloc.name, (uint64_t)srcloc->name ); + MemWrite( &item.srcloc.file, (uint64_t)srcloc->file ); + MemWrite( &item.srcloc.function, (uint64_t)srcloc->function ); + MemWrite( &item.srcloc.line, srcloc->line ); + MemWrite( &item.srcloc.r, uint8_t( ( srcloc->color ) & 0xFF ) ); + MemWrite( &item.srcloc.g, uint8_t( ( srcloc->color >> 8 ) & 0xFF ) ); + MemWrite( &item.srcloc.b, uint8_t( ( srcloc->color >> 16 ) & 0xFF ) ); + AppendData( &item, QueueDataSize[(int)QueueType::SourceLocation] ); +} + +void Profiler::SendSourceLocationPayload( uint64_t _ptr ) +{ + auto ptr = (const char*)_ptr; + + QueueItem item; + MemWrite( &item.hdr.type, QueueType::SourceLocationPayload ); + MemWrite( &item.stringTransfer.ptr, _ptr ); + + const auto len = *((uint32_t*)ptr); + assert( len <= std::numeric_limits<uint16_t>::max() ); + assert( len > 4 ); + const auto l16 = uint16_t( len - 4 ); + + NeedDataSize( QueueDataSize[(int)QueueType::SourceLocationPayload] + sizeof( l16 ) + l16 ); + + AppendDataUnsafe( &item, QueueDataSize[(int)QueueType::SourceLocationPayload] ); + AppendDataUnsafe( &l16, sizeof( l16 ) ); + AppendDataUnsafe( ptr + 4, l16 ); +} + +void Profiler::SendCallstackPayload( uint64_t _ptr ) +{ + auto ptr = (uintptr_t*)_ptr; + + QueueItem item; + MemWrite( &item.hdr.type, QueueType::CallstackPayload ); + MemWrite( &item.stringTransfer.ptr, _ptr ); + + const auto sz = *ptr++; + const auto len = sz * sizeof( uint64_t ); + const auto l16 = uint16_t( len ); + + NeedDataSize( QueueDataSize[(int)QueueType::CallstackPayload] + sizeof( l16 ) + l16 ); + + AppendDataUnsafe( &item, QueueDataSize[(int)QueueType::CallstackPayload] ); + AppendDataUnsafe( &l16, sizeof( l16 ) ); + + if( compile_time_condition<sizeof( uintptr_t ) == sizeof( uint64_t )>::value ) + { + AppendDataUnsafe( ptr, sizeof( uint64_t ) * sz ); + } + else + { + for( uintptr_t i=0; i<sz; i++ ) + { + const auto val = uint64_t( *ptr++ ); + AppendDataUnsafe( &val, 
sizeof( uint64_t ) ); + } + } +} + +void Profiler::SendCallstackAlloc( uint64_t _ptr ) +{ + auto ptr = (const char*)_ptr; + + QueueItem item; + MemWrite( &item.hdr.type, QueueType::CallstackAllocPayload ); + MemWrite( &item.stringTransfer.ptr, _ptr ); + + const auto len = *((uint32_t*)ptr); + assert( len <= std::numeric_limits<uint16_t>::max() ); + const auto l16 = uint16_t( len ); + + NeedDataSize( QueueDataSize[(int)QueueType::CallstackAllocPayload] + sizeof( l16 ) + l16 ); + + AppendDataUnsafe( &item, QueueDataSize[(int)QueueType::CallstackAllocPayload] ); + AppendDataUnsafe( &l16, sizeof( l16 ) ); + AppendDataUnsafe( ptr + 4, l16 ); +} + +void Profiler::SendCallstackFrame( uint64_t ptr ) +{ +#ifdef TRACY_HAS_CALLSTACK + const auto frameData = DecodeCallstackPtr( ptr ); + + { + QueueItem item; + MemWrite( &item.hdr.type, QueueType::CallstackFrameSize ); + MemWrite( &item.callstackFrameSize.ptr, ptr ); + MemWrite( &item.callstackFrameSize.size, frameData.size ); + + AppendData( &item, QueueDataSize[(int)QueueType::CallstackFrameSize] ); + } + + for( uint8_t i=0; i<frameData.size; i++ ) + { + const auto& frame = frameData.data[i]; + + SendString( uint64_t( frame.name ), frame.name, QueueType::CustomStringData ); + SendString( uint64_t( frame.file ), frame.file, QueueType::CustomStringData ); + + QueueItem item; + MemWrite( &item.hdr.type, QueueType::CallstackFrame ); + MemWrite( &item.callstackFrame.name, (uint64_t)frame.name ); + MemWrite( &item.callstackFrame.file, (uint64_t)frame.file ); + MemWrite( &item.callstackFrame.line, frame.line ); + + AppendData( &item, QueueDataSize[(int)QueueType::CallstackFrame] ); + + tracy_free( (void*)frame.name ); + tracy_free( (void*)frame.file ); + } +#endif +} + + +static bool DontExit() { return false; } + +bool Profiler::HandleServerQuery() +{ + uint8_t type; + if( !m_sock->Read( &type, sizeof( type ), 10, DontExit ) ) return false; + + uint64_t ptr; + if( !m_sock->Read( &ptr, sizeof( ptr ), 10, DontExit ) ) return false; + + switch( type ) + { + case ServerQueryString: + SendString( ptr, (const char*)ptr, QueueType::StringData ); + break; + case ServerQueryThreadString: + if( ptr == m_mainThread ) + { + SendString( ptr, "Main thread", QueueType::ThreadName ); + } + else + { + SendString( ptr, GetThreadName( ptr ), QueueType::ThreadName ); + } + break; + case ServerQuerySourceLocation: + SendSourceLocation( ptr ); + break; + case ServerQueryPlotName: + SendString( ptr, (const char*)ptr, QueueType::PlotName ); + break; + case ServerQueryTerminate: + return false; + case ServerQueryCallstackFrame: + SendCallstackFrame( ptr ); + break; + case ServerQueryFrameName: + SendString( ptr, (const char*)ptr, QueueType::FrameName ); + break; + case ServerQueryDisconnect: + HandleDisconnect(); + return false; +#ifdef TRACY_HAS_SYSTEM_TRACING + case ServerQueryExternalName: + SysTraceSendExternalName( ptr ); + break; +#endif + case ServerQueryParameter: + HandleParameter( ptr ); + break; + default: + assert( false ); + break; + } + + return true; +} + +void Profiler::HandleDisconnect() +{ + moodycamel::ConsumerToken token( GetQueue() ); + +#ifdef TRACY_HAS_SYSTEM_TRACING + if( s_sysTraceThread ) + { + auto timestamp = GetTime(); + for(;;) + { + const auto status = DequeueContextSwitches( token, timestamp ); + if( status == DequeueStatus::ConnectionLost ) + { + return; + } + else if( status == DequeueStatus::QueueEmpty ) + { + if( m_bufferOffset != m_bufferStart ) + { + if( !CommitData() ) return; + } + } + if( timestamp < 0 ) + { + if( m_bufferOffset != 
m_bufferStart ) + { + if( !CommitData() ) return; + } + break; + } + ClearSerial(); + if( m_sock->HasData() ) + { + while( m_sock->HasData() ) + { + if( !HandleServerQuery() ) return; + } + if( m_bufferOffset != m_bufferStart ) + { + if( !CommitData() ) return; + } + } + else + { + if( m_bufferOffset != m_bufferStart ) + { + if( !CommitData() ) return; + } + std::this_thread::sleep_for( std::chrono::milliseconds( 10 ) ); + } + } + } +#endif + + QueueItem terminate; + MemWrite( &terminate.hdr.type, QueueType::Terminate ); + if( !SendData( (const char*)&terminate, 1 ) ) return; + for(;;) + { + ClearQueues( token ); + if( m_sock->HasData() ) + { + while( m_sock->HasData() ) + { + if( !HandleServerQuery() ) return; + } + if( m_bufferOffset != m_bufferStart ) + { + if( !CommitData() ) return; + } + } + else + { + if( m_bufferOffset != m_bufferStart ) + { + if( !CommitData() ) return; + } + std::this_thread::sleep_for( std::chrono::milliseconds( 10 ) ); + } + } +} + +void Profiler::CalibrateTimer() +{ +#ifdef TRACY_HW_TIMER +# if !defined TARGET_OS_IOS && __ARM_ARCH >= 6 + m_timerMul = 1.; +# else + std::atomic_signal_fence( std::memory_order_acq_rel ); + const auto t0 = std::chrono::high_resolution_clock::now(); + const auto r0 = GetTime(); + std::atomic_signal_fence( std::memory_order_acq_rel ); + std::this_thread::sleep_for( std::chrono::milliseconds( 200 ) ); + std::atomic_signal_fence( std::memory_order_acq_rel ); + const auto t1 = std::chrono::high_resolution_clock::now(); + const auto r1 = GetTime(); + std::atomic_signal_fence( std::memory_order_acq_rel ); + + const auto dt = std::chrono::duration_cast<std::chrono::nanoseconds>( t1 - t0 ).count(); + const auto dr = r1 - r0; + + m_timerMul = double( dt ) / double( dr ); +# endif +#else + m_timerMul = 1.; +#endif +} + +void Profiler::CalibrateDelay() +{ + enum { Iterations = 50000 }; + + auto mindiff = std::numeric_limits<int64_t>::max(); + for( int i=0; i<Iterations * 10; i++ ) + { + const auto t0i = GetTime(); + const auto t1i = GetTime(); + const auto dti = t1i - t0i; + if( dti > 0 && dti < mindiff ) mindiff = dti; + } + m_resolution = mindiff; + +#ifdef TRACY_DELAYED_INIT + m_delay = m_resolution; +#else + enum { Events = Iterations * 2 }; // start + end + static_assert( Events < QueuePrealloc, "Delay calibration loop will allocate memory in queue" ); + + moodycamel::ProducerToken ptoken_detail( GetQueue() ); + moodycamel::ConcurrentQueue<QueueItem>::ExplicitProducer* ptoken = GetQueue().get_explicit_producer( ptoken_detail ); + static const tracy::SourceLocationData __tracy_source_location { nullptr, __FUNCTION__, __FILE__, (uint32_t)__LINE__, 0 }; + const auto t0 = GetTime(); + for( int i=0; i<Iterations; i++ ) + { + { + Magic magic; + auto& tail = ptoken->get_tail_index(); + auto item = ptoken->enqueue_begin( magic ); + MemWrite( &item->hdr.type, QueueType::ZoneBegin ); + MemWrite( &item->zoneBegin.time, Profiler::GetTime() ); + MemWrite( &item->zoneBegin.srcloc, (uint64_t)&__tracy_source_location ); + tail.store( magic + 1, std::memory_order_release ); + } + { + Magic magic; + auto& tail = ptoken->get_tail_index(); + auto item = ptoken->enqueue_begin( magic ); + MemWrite( &item->hdr.type, QueueType::ZoneEnd ); + MemWrite( &item->zoneEnd.time, GetTime() ); + tail.store( magic + 1, std::memory_order_release ); + } + } + const auto t1 = GetTime(); + const auto dt = t1 - t0; + m_delay = dt / Events; + + enum { Bulk = 1000 }; + moodycamel::ConsumerToken token( GetQueue() ); + int left = Events; + QueueItem item[Bulk]; + while( left != 0 
) + { + const auto sz = GetQueue().try_dequeue_bulk( token, item, std::min( left, (int)Bulk ) ); + assert( sz > 0 ); + left -= (int)sz; + } + assert( GetQueue().size_approx() == 0 ); +#endif +} + +void Profiler::SendCallstack( int depth, const char* skipBefore ) +{ +#ifdef TRACY_HAS_CALLSTACK + auto ptr = Callstack( depth ); + CutCallstack( ptr, skipBefore ); + + Magic magic; + auto token = GetToken(); + auto& tail = token->get_tail_index(); + auto item = token->enqueue_begin( magic ); + MemWrite( &item->hdr.type, QueueType::Callstack ); + MemWrite( &item->callstack.ptr, ptr ); + tail.store( magic + 1, std::memory_order_release ); +#endif +} + +void Profiler::CutCallstack( void* callstack, const char* skipBefore ) +{ +#ifdef TRACY_HAS_CALLSTACK + auto data = (uintptr_t*)callstack; + const auto sz = *data++; + uintptr_t i; + for( i=0; i<sz; i++ ) + { + auto name = DecodeCallstackPtrFast( uint64_t( data[i] ) ); + const bool found = strcmp( name, skipBefore ) == 0; + if( found ) + { + i++; + break; + } + } + + if( i != sz ) + { + memmove( data, data + i, ( sz - i ) * sizeof( uintptr_t* ) ); + *--data = sz - i; + } +#endif +} + +#ifdef TRACY_HAS_SYSTIME +void Profiler::ProcessSysTime() +{ + if( m_shutdown.load( std::memory_order_relaxed ) ) return; + auto t = std::chrono::high_resolution_clock::now().time_since_epoch().count(); + if( t - m_sysTimeLast > 100000000 ) // 100 ms + { + auto sysTime = m_sysTime.Get(); + if( sysTime >= 0 ) + { + m_sysTimeLast = t; + + Magic magic; + auto token = GetToken(); + auto& tail = token->get_tail_index(); + auto item = token->enqueue_begin( magic ); + MemWrite( &item->hdr.type, QueueType::SysTimeReport ); + MemWrite( &item->sysTime.time, GetTime() ); + MemWrite( &item->sysTime.sysTime, sysTime ); + tail.store( magic + 1, std::memory_order_release ); + } + } +} +#endif + +void Profiler::ParameterSetup( uint32_t idx, const char* name, bool isBool, int32_t val ) +{ + tracy::Magic magic; + auto token = tracy::GetToken(); + auto& tail = token->get_tail_index(); + auto item = token->enqueue_begin( magic ); + tracy::MemWrite( &item->hdr.type, tracy::QueueType::ParamSetup ); + tracy::MemWrite( &item->paramSetup.idx, idx ); + tracy::MemWrite( &item->paramSetup.name, (uint64_t)name ); + tracy::MemWrite( &item->paramSetup.isBool, (uint8_t)isBool ); + tracy::MemWrite( &item->paramSetup.val, val ); + +#ifdef TRACY_ON_DEMAND + GetProfiler().DeferItem( *item ); +#endif + + tail.store( magic + 1, std::memory_order_release ); +} + +void Profiler::HandleParameter( uint64_t payload ) +{ + assert( m_paramCallback ); + const auto idx = uint32_t( payload >> 32 ); + const auto val = int32_t( payload & 0xFFFFFFFF ); + m_paramCallback( idx, val ); +} + +} + +#ifdef __cplusplus +extern "C" { +#endif + +TRACY_API TracyCZoneCtx ___tracy_emit_zone_begin( const struct ___tracy_source_location_data* srcloc, int active ) +{ + ___tracy_c_zone_context ctx; +#ifdef TRACY_ON_DEMAND + ctx.active = active && tracy::GetProfiler().IsConnected(); +#else + ctx.active = active; +#endif + if( !ctx.active ) return ctx; + const auto id = tracy::GetProfiler().GetNextZoneId(); + ctx.id = id; + +#ifndef TRACY_NO_VERIFY + { + tracy::Magic magic; + auto token = tracy::GetToken(); + auto& tail = token->get_tail_index(); + auto item = token->enqueue_begin( magic ); + tracy::MemWrite( &item->hdr.type, tracy::QueueType::ZoneValidation ); + tracy::MemWrite( &item->zoneValidation.id, id ); + tail.store( magic + 1, std::memory_order_release ); + } +#endif + { + tracy::Magic magic; + auto token = tracy::GetToken(); + 
auto& tail = token->get_tail_index(); + auto item = token->enqueue_begin( magic ); + tracy::MemWrite( &item->hdr.type, tracy::QueueType::ZoneBegin ); + tracy::MemWrite( &item->zoneBegin.time, tracy::Profiler::GetTime() ); + tracy::MemWrite( &item->zoneBegin.srcloc, (uint64_t)srcloc ); + tail.store( magic + 1, std::memory_order_release ); + } + return ctx; +} + +TRACY_API TracyCZoneCtx ___tracy_emit_zone_begin_callstack( const struct ___tracy_source_location_data* srcloc, int depth, int active ) +{ + ___tracy_c_zone_context ctx; +#ifdef TRACY_ON_DEMAND + ctx.active = active && tracy::GetProfiler().IsConnected(); +#else + ctx.active = active; +#endif + if( !ctx.active ) return ctx; + const auto id = tracy::GetProfiler().GetNextZoneId(); + ctx.id = id; + +#ifndef TRACY_NO_VERIFY + { + tracy::Magic magic; + auto token = tracy::GetToken(); + auto& tail = token->get_tail_index(); + auto item = token->enqueue_begin( magic ); + tracy::MemWrite( &item->hdr.type, tracy::QueueType::ZoneValidation ); + tracy::MemWrite( &item->zoneValidation.id, id ); + tail.store( magic + 1, std::memory_order_release ); + } +#endif + { + tracy::Magic magic; + auto token = tracy::GetToken(); + auto& tail = token->get_tail_index(); + auto item = token->enqueue_begin( magic ); + tracy::MemWrite( &item->hdr.type, tracy::QueueType::ZoneBeginCallstack ); + tracy::MemWrite( &item->zoneBegin.time, tracy::Profiler::GetTime() ); + tracy::MemWrite( &item->zoneBegin.srcloc, (uint64_t)srcloc ); + tail.store( magic + 1, std::memory_order_release ); + } + + tracy::GetProfiler().SendCallstack( depth ); + return ctx; +} + +TRACY_API void ___tracy_emit_zone_end( TracyCZoneCtx ctx ) +{ + if( !ctx.active ) return; +#ifndef TRACY_NO_VERIFY + { + tracy::Magic magic; + auto token = tracy::GetToken(); + auto& tail = token->get_tail_index(); + auto item = token->enqueue_begin( magic ); + tracy::MemWrite( &item->hdr.type, tracy::QueueType::ZoneValidation ); + tracy::MemWrite( &item->zoneValidation.id, ctx.id ); + tail.store( magic + 1, std::memory_order_release ); + } +#endif + { + tracy::Magic magic; + auto token = tracy::GetToken(); + auto& tail = token->get_tail_index(); + auto item = token->enqueue_begin( magic ); + tracy::MemWrite( &item->hdr.type, tracy::QueueType::ZoneEnd ); + tracy::MemWrite( &item->zoneEnd.time, tracy::Profiler::GetTime() ); + tail.store( magic + 1, std::memory_order_release ); + } +} + +TRACY_API void ___tracy_emit_zone_text( TracyCZoneCtx ctx, const char* txt, size_t size ) +{ + if( !ctx.active ) return; + auto ptr = (char*)tracy::tracy_malloc( size+1 ); + memcpy( ptr, txt, size ); + ptr[size] = '\0'; +#ifndef TRACY_NO_VERIFY + { + tracy::Magic magic; + auto token = tracy::GetToken(); + auto& tail = token->get_tail_index(); + auto item = token->enqueue_begin( magic ); + tracy::MemWrite( &item->hdr.type, tracy::QueueType::ZoneValidation ); + tracy::MemWrite( &item->zoneValidation.id, ctx.id ); + tail.store( magic + 1, std::memory_order_release ); + } +#endif + { + tracy::Magic magic; + auto token = tracy::GetToken(); + auto& tail = token->get_tail_index(); + auto item = token->enqueue_begin( magic ); + tracy::MemWrite( &item->hdr.type, tracy::QueueType::ZoneText ); + tracy::MemWrite( &item->zoneText.text, (uint64_t)ptr ); + tail.store( magic + 1, std::memory_order_release ); + } +} + +TRACY_API void ___tracy_emit_zone_name( TracyCZoneCtx ctx, const char* txt, size_t size ) +{ + if( !ctx.active ) return; + auto ptr = (char*)tracy::tracy_malloc( size+1 ); + memcpy( ptr, txt, size ); + ptr[size] = '\0'; +#ifndef 
TRACY_NO_VERIFY + { + tracy::Magic magic; + auto token = tracy::GetToken(); + auto& tail = token->get_tail_index(); + auto item = token->enqueue_begin( magic ); + tracy::MemWrite( &item->hdr.type, tracy::QueueType::ZoneValidation ); + tracy::MemWrite( &item->zoneValidation.id, ctx.id ); + tail.store( magic + 1, std::memory_order_release ); + } +#endif + { + tracy::Magic magic; + auto token = tracy::GetToken(); + auto& tail = token->get_tail_index(); + auto item = token->enqueue_begin( magic ); + tracy::MemWrite( &item->hdr.type, tracy::QueueType::ZoneName ); + tracy::MemWrite( &item->zoneText.text, (uint64_t)ptr ); + tail.store( magic + 1, std::memory_order_release ); + } +} + +TRACY_API void ___tracy_emit_memory_alloc( const void* ptr, size_t size ) { tracy::Profiler::MemAlloc( ptr, size ); } +TRACY_API void ___tracy_emit_memory_alloc_callstack( const void* ptr, size_t size, int depth ) { tracy::Profiler::MemAllocCallstack( ptr, size, depth ); } +TRACY_API void ___tracy_emit_memory_free( const void* ptr ) { tracy::Profiler::MemFree( ptr ); } +TRACY_API void ___tracy_emit_memory_free_callstack( const void* ptr, int depth ) { tracy::Profiler::MemFreeCallstack( ptr, depth ); } +TRACY_API void ___tracy_emit_frame_mark( const char* name ) { tracy::Profiler::SendFrameMark( name ); } +TRACY_API void ___tracy_emit_frame_mark_start( const char* name ) { tracy::Profiler::SendFrameMark( name, tracy::QueueType::FrameMarkMsgStart ); } +TRACY_API void ___tracy_emit_frame_mark_end( const char* name ) { tracy::Profiler::SendFrameMark( name, tracy::QueueType::FrameMarkMsgEnd ); } +TRACY_API void ___tracy_emit_frame_image( const void* image, uint16_t w, uint16_t h, uint8_t offset, int flip ) { tracy::Profiler::SendFrameImage( image, w, h, offset, flip ); } +TRACY_API void ___tracy_emit_plot( const char* name, double val ) { tracy::Profiler::PlotData( name, val ); } +TRACY_API void ___tracy_emit_message( const char* txt, size_t size, int callstack ) { tracy::Profiler::Message( txt, size, callstack ); } +TRACY_API void ___tracy_emit_messageL( const char* txt, int callstack ) { tracy::Profiler::Message( txt, callstack ); } +TRACY_API void ___tracy_emit_messageC( const char* txt, size_t size, uint32_t color, int callstack ) { tracy::Profiler::MessageColor( txt, size, color, callstack ); } +TRACY_API void ___tracy_emit_messageLC( const char* txt, uint32_t color, int callstack ) { tracy::Profiler::MessageColor( txt, color, callstack ); } +TRACY_API void ___tracy_emit_message_appinfo( const char* txt, size_t size ) { tracy::Profiler::MessageAppInfo( txt, size ); } + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/libs/tracy/client/TracyProfiler.hpp b/libs/tracy/client/TracyProfiler.hpp @@ -0,0 +1,620 @@ +#ifndef __TRACYPROFILER_HPP__ +#define __TRACYPROFILER_HPP__ + +#include <assert.h> +#include <atomic> +#include <stdint.h> +#include <string.h> + +#include "tracy_concurrentqueue.h" +#include "TracyCallstack.hpp" +#include "TracySysTime.hpp" +#include "TracyFastVector.hpp" +#include "../common/TracyQueue.hpp" +#include "../common/TracyAlign.hpp" +#include "../common/TracyAlloc.hpp" +#include "../common/TracyMutex.hpp" + +#if defined _WIN32 || defined __CYGWIN__ +# include <intrin.h> +#endif +#ifdef __APPLE__ +# include <TargetConditionals.h> +# include <mach/mach_time.h> +#endif + +#if defined _WIN32 || defined __CYGWIN__ || ( ( defined __i386 || defined _M_IX86 || defined __x86_64__ || defined _M_X64 ) && !defined __ANDROID__ ) || __ARM_ARCH >= 6 +# define TRACY_HW_TIMER +#endif + +#if !defined 
TRACY_HW_TIMER || ( __ARM_ARCH >= 6 && !defined CLOCK_MONOTONIC_RAW ) + #include <chrono> +#endif + +#ifndef TracyConcat +# define TracyConcat(x,y) TracyConcatIndirect(x,y) +#endif +#ifndef TracyConcatIndirect +# define TracyConcatIndirect(x,y) x##y +#endif + +namespace tracy +{ + +class GpuCtx; +class Profiler; +class Socket; +class UdpBroadcast; + +struct GpuCtxWrapper +{ + GpuCtx* ptr; +}; + +TRACY_API moodycamel::ConcurrentQueue<QueueItem>::ExplicitProducer* GetToken(); +TRACY_API Profiler& GetProfiler(); +TRACY_API std::atomic<uint32_t>& GetLockCounter(); +TRACY_API std::atomic<uint8_t>& GetGpuCtxCounter(); +TRACY_API GpuCtxWrapper& GetGpuCtx(); +TRACY_API uint64_t GetThreadHandle(); + +TRACY_API void InitRPMallocThread(); + +struct SourceLocationData +{ + const char* name; + const char* function; + const char* file; + uint32_t line; + uint32_t color; +}; + +#ifdef TRACY_ON_DEMAND +struct LuaZoneState +{ + uint32_t counter; + bool active; +}; +#endif + +using Magic = moodycamel::ConcurrentQueueDefaultTraits::index_t; + + +typedef void(*ParameterCallback)( uint32_t idx, int32_t val ); + +class Profiler +{ + struct FrameImageQueueItem + { + void* image; + uint64_t frame; + uint16_t w; + uint16_t h; + uint8_t offset; + bool flip; + }; + +public: + Profiler(); + ~Profiler(); + + static tracy_force_inline int64_t GetTime() + { +#ifdef TRACY_HW_TIMER +# if TARGET_OS_IOS == 1 + return mach_absolute_time(); +# elif __ARM_ARCH >= 6 +# ifdef CLOCK_MONOTONIC_RAW + struct timespec ts; + clock_gettime( CLOCK_MONOTONIC_RAW, &ts ); + return int64_t( ts.tv_sec ) * 1000000000ll + int64_t( ts.tv_nsec ); +# else + return std::chrono::duration_cast<std::chrono::nanoseconds>( std::chrono::high_resolution_clock::now().time_since_epoch() ).count(); +# endif +# elif defined _WIN32 || defined __CYGWIN__ + return int64_t( __rdtsc() ); +# elif defined __i386 || defined _M_IX86 + uint32_t eax, edx; + asm volatile ( "rdtsc" : "=a" (eax), "=d" (edx) ); + return ( uint64_t( edx ) << 32 ) + uint64_t( eax ); +# elif defined __x86_64__ || defined _M_X64 + uint64_t rax, rdx; + asm volatile ( "rdtsc" : "=a" (rax), "=d" (rdx) ); + return ( rdx << 32 ) + rax; +# endif +#else + return std::chrono::duration_cast<std::chrono::nanoseconds>( std::chrono::high_resolution_clock::now().time_since_epoch() ).count(); +#endif + } + + tracy_force_inline uint32_t GetNextZoneId() + { + return m_zoneId.fetch_add( 1, std::memory_order_relaxed ); + } + + static tracy_force_inline QueueItem* QueueSerial() + { + auto& p = GetProfiler(); + p.m_serialLock.lock(); + return p.m_serialQueue.prepare_next(); + } + + static tracy_force_inline void QueueSerialFinish() + { + auto& p = GetProfiler(); + p.m_serialQueue.commit_next(); + p.m_serialLock.unlock(); + } + + static tracy_force_inline void SendFrameMark( const char* name ) + { + if( !name ) GetProfiler().m_frameCount.fetch_add( 1, std::memory_order_relaxed ); +#ifdef TRACY_ON_DEMAND + if( !GetProfiler().IsConnected() ) return; +#endif + Magic magic; + auto token = GetToken(); + auto& tail = token->get_tail_index(); + auto item = token->enqueue_begin( magic ); + MemWrite( &item->hdr.type, QueueType::FrameMarkMsg ); + MemWrite( &item->frameMark.time, GetTime() ); + MemWrite( &item->frameMark.name, uint64_t( name ) ); + tail.store( magic + 1, std::memory_order_release ); + } + + static tracy_force_inline void SendFrameMark( const char* name, QueueType type ) + { + assert( type == QueueType::FrameMarkMsgStart || type == QueueType::FrameMarkMsgEnd ); +#ifdef TRACY_ON_DEMAND + if( 
!GetProfiler().IsConnected() ) return; +#endif + auto item = QueueSerial(); + MemWrite( &item->hdr.type, type ); + MemWrite( &item->frameMark.time, GetTime() ); + MemWrite( &item->frameMark.name, uint64_t( name ) ); + QueueSerialFinish(); + } + + static tracy_force_inline void SendFrameImage( const void* image, uint16_t w, uint16_t h, uint8_t offset, bool flip ) + { + auto& profiler = GetProfiler(); +#ifdef TRACY_ON_DEMAND + if( !profiler.IsConnected() ) return; +#endif + const auto sz = size_t( w ) * size_t( h ) * 4; + auto ptr = (char*)tracy_malloc( sz ); + memcpy( ptr, image, sz ); + + profiler.m_fiLock.lock(); + auto fi = profiler.m_fiQueue.prepare_next(); + fi->image = ptr; + fi->frame = profiler.m_frameCount.load( std::memory_order_relaxed ) - offset; + fi->w = w; + fi->h = h; + fi->flip = flip; + profiler.m_fiQueue.commit_next(); + profiler.m_fiLock.unlock(); + } + + static tracy_force_inline void PlotData( const char* name, int64_t val ) + { +#ifdef TRACY_ON_DEMAND + if( !GetProfiler().IsConnected() ) return; +#endif + Magic magic; + auto token = GetToken(); + auto& tail = token->get_tail_index(); + auto item = token->enqueue_begin( magic ); + MemWrite( &item->hdr.type, QueueType::PlotData ); + MemWrite( &item->plotData.name, (uint64_t)name ); + MemWrite( &item->plotData.time, GetTime() ); + MemWrite( &item->plotData.type, PlotDataType::Int ); + MemWrite( &item->plotData.data.i, val ); + tail.store( magic + 1, std::memory_order_release ); + } + + static tracy_force_inline void PlotData( const char* name, float val ) + { +#ifdef TRACY_ON_DEMAND + if( !GetProfiler().IsConnected() ) return; +#endif + Magic magic; + auto token = GetToken(); + auto& tail = token->get_tail_index(); + auto item = token->enqueue_begin( magic ); + MemWrite( &item->hdr.type, QueueType::PlotData ); + MemWrite( &item->plotData.name, (uint64_t)name ); + MemWrite( &item->plotData.time, GetTime() ); + MemWrite( &item->plotData.type, PlotDataType::Float ); + MemWrite( &item->plotData.data.f, val ); + tail.store( magic + 1, std::memory_order_release ); + } + + static tracy_force_inline void PlotData( const char* name, double val ) + { +#ifdef TRACY_ON_DEMAND + if( !GetProfiler().IsConnected() ) return; +#endif + Magic magic; + auto token = GetToken(); + auto& tail = token->get_tail_index(); + auto item = token->enqueue_begin( magic ); + MemWrite( &item->hdr.type, QueueType::PlotData ); + MemWrite( &item->plotData.name, (uint64_t)name ); + MemWrite( &item->plotData.time, GetTime() ); + MemWrite( &item->plotData.type, PlotDataType::Double ); + MemWrite( &item->plotData.data.d, val ); + tail.store( magic + 1, std::memory_order_release ); + } + + static tracy_force_inline void ConfigurePlot( const char* name, PlotFormatType type ) + { + Magic magic; + auto token = GetToken(); + auto& tail = token->get_tail_index(); + auto item = token->enqueue_begin( magic ); + MemWrite( &item->hdr.type, QueueType::PlotConfig ); + MemWrite( &item->plotConfig.name, (uint64_t)name ); + MemWrite( &item->plotConfig.type, (uint8_t)type ); + +#ifdef TRACY_ON_DEMAND + GetProfiler().DeferItem( *item ); +#endif + + tail.store( magic + 1, std::memory_order_release ); + } + + static tracy_force_inline void Message( const char* txt, size_t size, int callstack ) + { +#ifdef TRACY_ON_DEMAND + if( !GetProfiler().IsConnected() ) return; +#endif + Magic magic; + auto token = GetToken(); + auto ptr = (char*)tracy_malloc( size+1 ); + memcpy( ptr, txt, size ); + ptr[size] = '\0'; + auto& tail = token->get_tail_index(); + auto item = token->enqueue_begin( 
magic ); + MemWrite( &item->hdr.type, callstack == 0 ? QueueType::Message : QueueType::MessageCallstack ); + MemWrite( &item->message.time, GetTime() ); + MemWrite( &item->message.text, (uint64_t)ptr ); + tail.store( magic + 1, std::memory_order_release ); + + if( callstack != 0 ) tracy::GetProfiler().SendCallstack( callstack ); + } + + static tracy_force_inline void Message( const char* txt, int callstack ) + { +#ifdef TRACY_ON_DEMAND + if( !GetProfiler().IsConnected() ) return; +#endif + Magic magic; + auto token = GetToken(); + auto& tail = token->get_tail_index(); + auto item = token->enqueue_begin( magic ); + MemWrite( &item->hdr.type, callstack == 0 ? QueueType::MessageLiteral : QueueType::MessageLiteralCallstack ); + MemWrite( &item->message.time, GetTime() ); + MemWrite( &item->message.text, (uint64_t)txt ); + tail.store( magic + 1, std::memory_order_release ); + + if( callstack != 0 ) tracy::GetProfiler().SendCallstack( callstack ); + } + + static tracy_force_inline void MessageColor( const char* txt, size_t size, uint32_t color, int callstack ) + { +#ifdef TRACY_ON_DEMAND + if( !GetProfiler().IsConnected() ) return; +#endif + Magic magic; + auto token = GetToken(); + auto ptr = (char*)tracy_malloc( size+1 ); + memcpy( ptr, txt, size ); + ptr[size] = '\0'; + auto& tail = token->get_tail_index(); + auto item = token->enqueue_begin( magic ); + MemWrite( &item->hdr.type, callstack == 0 ? QueueType::MessageColor : QueueType::MessageColorCallstack ); + MemWrite( &item->messageColor.time, GetTime() ); + MemWrite( &item->messageColor.text, (uint64_t)ptr ); + MemWrite( &item->messageColor.r, uint8_t( ( color ) & 0xFF ) ); + MemWrite( &item->messageColor.g, uint8_t( ( color >> 8 ) & 0xFF ) ); + MemWrite( &item->messageColor.b, uint8_t( ( color >> 16 ) & 0xFF ) ); + tail.store( magic + 1, std::memory_order_release ); + + if( callstack != 0 ) tracy::GetProfiler().SendCallstack( callstack ); + } + + static tracy_force_inline void MessageColor( const char* txt, uint32_t color, int callstack ) + { +#ifdef TRACY_ON_DEMAND + if( !GetProfiler().IsConnected() ) return; +#endif + Magic magic; + auto token = GetToken(); + auto& tail = token->get_tail_index(); + auto item = token->enqueue_begin( magic ); + MemWrite( &item->hdr.type, callstack == 0 ? 
QueueType::MessageLiteralColor : QueueType::MessageLiteralColorCallstack ); + MemWrite( &item->messageColor.time, GetTime() ); + MemWrite( &item->messageColor.text, (uint64_t)txt ); + MemWrite( &item->messageColor.r, uint8_t( ( color ) & 0xFF ) ); + MemWrite( &item->messageColor.g, uint8_t( ( color >> 8 ) & 0xFF ) ); + MemWrite( &item->messageColor.b, uint8_t( ( color >> 16 ) & 0xFF ) ); + tail.store( magic + 1, std::memory_order_release ); + + if( callstack != 0 ) tracy::GetProfiler().SendCallstack( callstack ); + } + + static tracy_force_inline void MessageAppInfo( const char* txt, size_t size ) + { + Magic magic; + auto token = GetToken(); + auto ptr = (char*)tracy_malloc( size+1 ); + memcpy( ptr, txt, size ); + ptr[size] = '\0'; + auto& tail = token->get_tail_index(); + auto item = token->enqueue_begin( magic ); + MemWrite( &item->hdr.type, QueueType::MessageAppInfo ); + MemWrite( &item->message.time, GetTime() ); + MemWrite( &item->message.text, (uint64_t)ptr ); + +#ifdef TRACY_ON_DEMAND + GetProfiler().DeferItem( *item ); +#endif + + tail.store( magic + 1, std::memory_order_release ); + } + + static tracy_force_inline void MemAlloc( const void* ptr, size_t size ) + { +#ifdef TRACY_ON_DEMAND + if( !GetProfiler().IsConnected() ) return; +#endif + const auto thread = GetThreadHandle(); + + GetProfiler().m_serialLock.lock(); + SendMemAlloc( QueueType::MemAlloc, thread, ptr, size ); + GetProfiler().m_serialLock.unlock(); + } + + static tracy_force_inline void MemFree( const void* ptr ) + { +#ifdef TRACY_ON_DEMAND + if( !GetProfiler().IsConnected() ) return; +#endif + const auto thread = GetThreadHandle(); + + GetProfiler().m_serialLock.lock(); + SendMemFree( QueueType::MemFree, thread, ptr ); + GetProfiler().m_serialLock.unlock(); + } + + static tracy_force_inline void MemAllocCallstack( const void* ptr, size_t size, int depth ) + { +#ifdef TRACY_HAS_CALLSTACK + auto& profiler = GetProfiler(); +# ifdef TRACY_ON_DEMAND + if( !profiler.IsConnected() ) return; +# endif + const auto thread = GetThreadHandle(); + + rpmalloc_thread_initialize(); + auto callstack = Callstack( depth ); + + profiler.m_serialLock.lock(); + SendMemAlloc( QueueType::MemAllocCallstack, thread, ptr, size ); + SendCallstackMemory( callstack ); + profiler.m_serialLock.unlock(); +#else + MemAlloc( ptr, size ); +#endif + } + + static tracy_force_inline void MemFreeCallstack( const void* ptr, int depth ) + { +#ifdef TRACY_HAS_CALLSTACK + auto& profiler = GetProfiler(); +# ifdef TRACY_ON_DEMAND + if( !profiler.IsConnected() ) return; +# endif + const auto thread = GetThreadHandle(); + + rpmalloc_thread_initialize(); + auto callstack = Callstack( depth ); + + profiler.m_serialLock.lock(); + SendMemFree( QueueType::MemFreeCallstack, thread, ptr ); + SendCallstackMemory( callstack ); + profiler.m_serialLock.unlock(); +#else + MemFree( ptr ); +#endif + } + + static tracy_force_inline void SendCallstack( int depth ) + { +#ifdef TRACY_HAS_CALLSTACK + auto ptr = Callstack( depth ); + Magic magic; + auto token = GetToken(); + auto& tail = token->get_tail_index(); + auto item = token->enqueue_begin( magic ); + MemWrite( &item->hdr.type, QueueType::Callstack ); + MemWrite( &item->callstack.ptr, ptr ); + tail.store( magic + 1, std::memory_order_release ); +#endif + } + + static void ParameterRegister( ParameterCallback cb ) { GetProfiler().m_paramCallback = cb; } + static void ParameterSetup( uint32_t idx, const char* name, bool isBool, int32_t val ); + + void SendCallstack( int depth, const char* skipBefore ); + static void 
CutCallstack( void* callstack, const char* skipBefore ); + + static bool ShouldExit(); + +#ifdef TRACY_ON_DEMAND + tracy_force_inline bool IsConnected() const + { + return m_isConnected.load( std::memory_order_acquire ); + } + + tracy_force_inline uint64_t ConnectionId() const + { + return m_connectionId.load( std::memory_order_acquire ); + } + + tracy_force_inline void DeferItem( const QueueItem& item ) + { + m_deferredLock.lock(); + auto dst = m_deferredQueue.push_next(); + memcpy( dst, &item, sizeof( item ) ); + m_deferredLock.unlock(); + } +#endif + + void RequestShutdown() { m_shutdown.store( true, std::memory_order_relaxed ); m_shutdownManual.store( true, std::memory_order_relaxed ); } + bool HasShutdownFinished() const { return m_shutdownFinished.load( std::memory_order_relaxed ); } + + void SendString( uint64_t ptr, const char* str, QueueType type ); + +private: + enum class DequeueStatus { DataDequeued, ConnectionLost, QueueEmpty }; + + static void LaunchWorker( void* ptr ) { ((Profiler*)ptr)->Worker(); } + void Worker(); + + static void LaunchCompressWorker( void* ptr ) { ((Profiler*)ptr)->CompressWorker(); } + void CompressWorker(); + + void ClearQueues( tracy::moodycamel::ConsumerToken& token ); + void ClearSerial(); + DequeueStatus Dequeue( tracy::moodycamel::ConsumerToken& token ); + DequeueStatus DequeueContextSwitches( tracy::moodycamel::ConsumerToken& token, int64_t& timeStop ); + DequeueStatus DequeueSerial(); + bool AppendData( const void* data, size_t len ); + bool CommitData(); + bool NeedDataSize( size_t len ); + + tracy_force_inline void AppendDataUnsafe( const void* data, size_t len ) + { + memcpy( m_buffer + m_bufferOffset, data, len ); + m_bufferOffset += int( len ); + } + + bool SendData( const char* data, size_t len ); + void SendLongString( uint64_t ptr, const char* str, size_t len, QueueType type ); + void SendSourceLocation( uint64_t ptr ); + void SendSourceLocationPayload( uint64_t ptr ); + void SendCallstackPayload( uint64_t ptr ); + void SendCallstackAlloc( uint64_t ptr ); + void SendCallstackFrame( uint64_t ptr ); + + bool HandleServerQuery(); + void HandleDisconnect(); + void HandleParameter( uint64_t payload ); + + void CalibrateTimer(); + void CalibrateDelay(); + + static tracy_force_inline void SendCallstackMemory( void* ptr ) + { +#ifdef TRACY_HAS_CALLSTACK + auto item = GetProfiler().m_serialQueue.prepare_next(); + MemWrite( &item->hdr.type, QueueType::CallstackMemory ); + MemWrite( &item->callstackMemory.ptr, (uint64_t)ptr ); + GetProfiler().m_serialQueue.commit_next(); +#endif + } + + static tracy_force_inline void SendMemAlloc( QueueType type, const uint64_t thread, const void* ptr, size_t size ) + { + assert( type == QueueType::MemAlloc || type == QueueType::MemAllocCallstack ); + + auto item = GetProfiler().m_serialQueue.prepare_next(); + MemWrite( &item->hdr.type, type ); + MemWrite( &item->memAlloc.time, GetTime() ); + MemWrite( &item->memAlloc.thread, thread ); + MemWrite( &item->memAlloc.ptr, (uint64_t)ptr ); + if( compile_time_condition<sizeof( size ) == 4>::value ) + { + memcpy( &item->memAlloc.size, &size, 4 ); + memset( &item->memAlloc.size + 4, 0, 2 ); + } + else + { + assert( sizeof( size ) == 8 ); + memcpy( &item->memAlloc.size, &size, 6 ); + } + GetProfiler().m_serialQueue.commit_next(); + } + + static tracy_force_inline void SendMemFree( QueueType type, const uint64_t thread, const void* ptr ) + { + assert( type == QueueType::MemFree || type == QueueType::MemFreeCallstack ); + + auto item = 
GetProfiler().m_serialQueue.prepare_next(); + MemWrite( &item->hdr.type, type ); + MemWrite( &item->memFree.time, GetTime() ); + MemWrite( &item->memFree.thread, thread ); + MemWrite( &item->memFree.ptr, (uint64_t)ptr ); + GetProfiler().m_serialQueue.commit_next(); + } + + double m_timerMul; + uint64_t m_resolution; + uint64_t m_delay; + std::atomic<int64_t> m_timeBegin; + uint64_t m_mainThread; + uint64_t m_epoch; + std::atomic<bool> m_shutdown; + std::atomic<bool> m_shutdownManual; + std::atomic<bool> m_shutdownFinished; + Socket* m_sock; + UdpBroadcast* m_broadcast; + bool m_noExit; + std::atomic<uint32_t> m_zoneId; + + uint64_t m_threadCtx; + int64_t m_refTimeThread; + int64_t m_refTimeSerial; + int64_t m_refTimeCtx; + int64_t m_refTimeGpu; + + void* m_stream; // LZ4_stream_t* + char* m_buffer; + int m_bufferOffset; + int m_bufferStart; + + QueueItem* m_itemBuf; + char* m_lz4Buf; + + FastVector<QueueItem> m_serialQueue, m_serialDequeue; + TracyMutex m_serialLock; + + FastVector<FrameImageQueueItem> m_fiQueue, m_fiDequeue; + TracyMutex m_fiLock; + + std::atomic<uint64_t> m_frameCount; +#ifdef TRACY_ON_DEMAND + std::atomic<bool> m_isConnected; + std::atomic<uint64_t> m_connectionId; + + TracyMutex m_deferredLock; + FastVector<QueueItem> m_deferredQueue; +#endif + +#ifdef TRACY_HAS_SYSTIME + void ProcessSysTime(); + + SysTime m_sysTime; + uint64_t m_sysTimeLast = 0; +#else + void ProcessSysTime() {} +#endif + + ParameterCallback m_paramCallback; +}; + +}; + +#endif diff --git a/libs/tracy/client/TracyScoped.hpp b/libs/tracy/client/TracyScoped.hpp @@ -0,0 +1,119 @@ +#ifndef __TRACYSCOPED_HPP__ +#define __TRACYSCOPED_HPP__ + +#include <stdint.h> +#include <string.h> + +#include "../common/TracySystem.hpp" +#include "../common/TracyAlign.hpp" +#include "../common/TracyAlloc.hpp" +#include "TracyProfiler.hpp" + +namespace tracy +{ + +class ScopedZone +{ +public: + tracy_force_inline ScopedZone( const SourceLocationData* srcloc, bool is_active = true ) +#ifdef TRACY_ON_DEMAND + : m_active( is_active && GetProfiler().IsConnected() ) + , m_connectionId( GetProfiler().ConnectionId() ) +#else + : m_active( is_active ) +#endif + { + if( !m_active ) return; + Magic magic; + auto token = GetToken(); + auto& tail = token->get_tail_index(); + auto item = token->enqueue_begin( magic ); + MemWrite( &item->hdr.type, QueueType::ZoneBegin ); + MemWrite( &item->zoneBegin.time, Profiler::GetTime() ); + MemWrite( &item->zoneBegin.srcloc, (uint64_t)srcloc ); + tail.store( magic + 1, std::memory_order_release ); + } + + tracy_force_inline ScopedZone( const SourceLocationData* srcloc, int depth, bool is_active = true ) +#ifdef TRACY_ON_DEMAND + : m_active( is_active && GetProfiler().IsConnected() ) + , m_connectionId( GetProfiler().ConnectionId() ) +#else + : m_active( is_active ) +#endif + { + if( !m_active ) return; + Magic magic; + auto token = GetToken(); + auto& tail = token->get_tail_index(); + auto item = token->enqueue_begin( magic ); + MemWrite( &item->hdr.type, QueueType::ZoneBeginCallstack ); + MemWrite( &item->zoneBegin.time, Profiler::GetTime() ); + MemWrite( &item->zoneBegin.srcloc, (uint64_t)srcloc ); + tail.store( magic + 1, std::memory_order_release ); + + GetProfiler().SendCallstack( depth ); + } + + tracy_force_inline ~ScopedZone() + { + if( !m_active ) return; +#ifdef TRACY_ON_DEMAND + if( GetProfiler().ConnectionId() != m_connectionId ) return; +#endif + Magic magic; + auto token = GetToken(); + auto& tail = token->get_tail_index(); + auto item = token->enqueue_begin( magic ); + MemWrite( 
&item->hdr.type, QueueType::ZoneEnd ); + MemWrite( &item->zoneEnd.time, Profiler::GetTime() ); + tail.store( magic + 1, std::memory_order_release ); + } + + tracy_force_inline void Text( const char* txt, size_t size ) + { + if( !m_active ) return; +#ifdef TRACY_ON_DEMAND + if( GetProfiler().ConnectionId() != m_connectionId ) return; +#endif + Magic magic; + auto token = GetToken(); + auto ptr = (char*)tracy_malloc( size+1 ); + memcpy( ptr, txt, size ); + ptr[size] = '\0'; + auto& tail = token->get_tail_index(); + auto item = token->enqueue_begin( magic ); + MemWrite( &item->hdr.type, QueueType::ZoneText ); + MemWrite( &item->zoneText.text, (uint64_t)ptr ); + tail.store( magic + 1, std::memory_order_release ); + } + + tracy_force_inline void Name( const char* txt, size_t size ) + { + if( !m_active ) return; +#ifdef TRACY_ON_DEMAND + if( GetProfiler().ConnectionId() != m_connectionId ) return; +#endif + Magic magic; + auto token = GetToken(); + auto ptr = (char*)tracy_malloc( size+1 ); + memcpy( ptr, txt, size ); + ptr[size] = '\0'; + auto& tail = token->get_tail_index(); + auto item = token->enqueue_begin( magic ); + MemWrite( &item->hdr.type, QueueType::ZoneName ); + MemWrite( &item->zoneText.text, (uint64_t)ptr ); + tail.store( magic + 1, std::memory_order_release ); + } + +private: + const bool m_active; + +#ifdef TRACY_ON_DEMAND + uint64_t m_connectionId; +#endif +}; + +} + +#endif diff --git a/libs/tracy/client/TracySysTime.cpp b/libs/tracy/client/TracySysTime.cpp @@ -0,0 +1,105 @@ +#include "TracySysTime.hpp" + +#ifdef TRACY_HAS_SYSTIME + +# if defined _WIN32 || defined __CYGWIN__ +# include <windows.h> +# elif defined __linux__ +# include <stdio.h> +# include <inttypes.h> +# elif defined __APPLE__ +# include <mach/mach_host.h> +# include <mach/host_info.h> +# elif defined BSD +# include <sys/types.h> +# include <sys/sysctl.h> +# endif + +namespace tracy +{ + +# if defined _WIN32 || defined __CYGWIN__ + +static inline uint64_t ConvertTime( const FILETIME& t ) +{ + return ( uint64_t( t.dwHighDateTime ) << 32 ) | uint64_t( t.dwLowDateTime ); +} + +void SysTime::ReadTimes() +{ + FILETIME idleTime; + FILETIME kernelTime; + FILETIME userTime; + + GetSystemTimes( &idleTime, &kernelTime, &userTime ); + + idle = ConvertTime( idleTime ); + const auto kernel = ConvertTime( kernelTime ); + const auto user = ConvertTime( userTime ); + used = kernel + user; +} + +# elif defined __linux__ + +void SysTime::ReadTimes() +{ + uint64_t user, nice, system; + FILE* f = fopen( "/proc/stat", "r" ); + if( f ) + { + fscanf( f, "cpu %" PRIu64 " %" PRIu64 " %" PRIu64" %" PRIu64, &user, &nice, &system, &idle ); + fclose( f ); + used = user + nice + system; + } +} + +# elif defined __APPLE__ + +void SysTime::ReadTimes() +{ + host_cpu_load_info_data_t info; + mach_msg_type_number_t cnt = HOST_CPU_LOAD_INFO_COUNT; + host_statistics( mach_host_self(), HOST_CPU_LOAD_INFO, reinterpret_cast<host_info_t>( &info ), &cnt ); + used = info.cpu_ticks[CPU_STATE_USER] + info.cpu_ticks[CPU_STATE_NICE] + info.cpu_ticks[CPU_STATE_SYSTEM]; + idle = info.cpu_ticks[CPU_STATE_IDLE]; +} + +# elif defined BSD + +void SysTime::ReadTimes() +{ + u_long data[5]; + size_t sz = sizeof( data ); + sysctlbyname( "kern.cp_time", &data, &sz, nullptr, 0 ); + used = data[0] + data[1] + data[2] + data[3]; + idle = data[4]; +} + +#endif + +SysTime::SysTime() +{ + ReadTimes(); +} + +float SysTime::Get() +{ + const auto oldUsed = used; + const auto oldIdle = idle; + + ReadTimes(); + + const auto diffIdle = idle - oldIdle; + const auto diffUsed = used - 
oldUsed; + +#if defined _WIN32 || defined __CYGWIN__ + return diffUsed == 0 ? -1 : ( diffUsed - diffIdle ) * 100.f / diffUsed; +#elif defined __linux__ || defined __APPLE__ || defined BSD + const auto total = diffUsed + diffIdle; + return total == 0 ? -1 : diffUsed * 100.f / total; +#endif +} + +} + +#endif diff --git a/libs/tracy/client/TracySysTime.hpp b/libs/tracy/client/TracySysTime.hpp @@ -0,0 +1,36 @@ +#ifndef __TRACYSYSTIME_HPP__ +#define __TRACYSYSTIME_HPP__ + +#if defined _WIN32 || defined __CYGWIN__ || defined __linux__ || defined __APPLE__ +# define TRACY_HAS_SYSTIME +#else +# include <sys/param.h> +#endif + +#ifdef BSD +# define TRACY_HAS_SYSTIME +#endif + +#ifdef TRACY_HAS_SYSTIME + +#include <stdint.h> + +namespace tracy +{ + +class SysTime +{ +public: + SysTime(); + float Get(); + + void ReadTimes(); + +private: + uint64_t idle, used; +}; + +} +#endif + +#endif diff --git a/libs/tracy/client/TracySysTrace.cpp b/libs/tracy/client/TracySysTrace.cpp @@ -0,0 +1,862 @@ +#include "TracySysTrace.hpp" + +#ifdef TRACY_HAS_SYSTEM_TRACING + +# if defined _WIN32 || defined __CYGWIN__ + +# ifndef NOMINMAX +# define NOMINMAX +# endif + +# define INITGUID +# include <assert.h> +# include <string.h> +# include <windows.h> +# include <dbghelp.h> +# include <evntrace.h> +# include <evntcons.h> +# include <psapi.h> +# include <winternl.h> + +# include "../common/TracyAlloc.hpp" +# include "../common/TracySystem.hpp" +# include "TracyProfiler.hpp" + +namespace tracy +{ + +TRACEHANDLE s_traceHandle; +TRACEHANDLE s_traceHandle2; +EVENT_TRACE_PROPERTIES* s_prop; + +struct CSwitch +{ + uint32_t newThreadId; + uint32_t oldThreadId; + int8_t newThreadPriority; + int8_t oldThreadPriority; + uint8_t previousCState; + int8_t spareByte; + int8_t oldThreadWaitReason; + int8_t oldThreadWaitMode; + int8_t oldThreadState; + int8_t oldThreadWaitIdealProcessor; + uint32_t newThreadWaitTime; + uint32_t reserved; +}; + +struct ReadyThread +{ + uint32_t threadId; + int8_t adjustReason; + int8_t adjustIncrement; + int8_t flag; + int8_t reserverd; +}; + +void WINAPI EventRecordCallback( PEVENT_RECORD record ) +{ +#ifdef TRACY_ON_DEMAND + if( !GetProfiler().IsConnected() ) return; +#endif + + const auto& hdr = record->EventHeader; + if( hdr.EventDescriptor.Opcode == 36 ) + { + const auto cswitch = (const CSwitch*)record->UserData; + + Magic magic; + auto token = GetToken(); + auto& tail = token->get_tail_index(); + auto item = token->enqueue_begin( magic ); + MemWrite( &item->hdr.type, QueueType::ContextSwitch ); + MemWrite( &item->contextSwitch.time, hdr.TimeStamp.QuadPart ); + memcpy( &item->contextSwitch.oldThread, &cswitch->oldThreadId, sizeof( cswitch->oldThreadId ) ); + memcpy( &item->contextSwitch.newThread, &cswitch->newThreadId, sizeof( cswitch->newThreadId ) ); + memset( ((char*)&item->contextSwitch.oldThread)+4, 0, 4 ); + memset( ((char*)&item->contextSwitch.newThread)+4, 0, 4 ); + MemWrite( &item->contextSwitch.cpu, record->BufferContext.ProcessorNumber ); + MemWrite( &item->contextSwitch.reason, cswitch->oldThreadWaitReason ); + MemWrite( &item->contextSwitch.state, cswitch->oldThreadState ); + tail.store( magic + 1, std::memory_order_release ); + } + else if( hdr.EventDescriptor.Opcode == 50 ) + { + const auto rt = (const ReadyThread*)record->UserData; + + Magic magic; + auto token = GetToken(); + auto& tail = token->get_tail_index(); + auto item = token->enqueue_begin( magic ); + MemWrite( &item->hdr.type, QueueType::ThreadWakeup ); + MemWrite( &item->threadWakeup.time, hdr.TimeStamp.QuadPart ); + 
memcpy( &item->threadWakeup.thread, &rt->threadId, sizeof( rt->threadId ) ); + memset( ((char*)&item->threadWakeup.thread)+4, 0, 4 ); + tail.store( magic + 1, std::memory_order_release ); + } +} + +bool SysTraceStart() +{ + TOKEN_PRIVILEGES priv = {}; + priv.PrivilegeCount = 1; + priv.Privileges[0].Attributes = SE_PRIVILEGE_ENABLED; + if( LookupPrivilegeValue( nullptr, SE_SYSTEM_PROFILE_NAME, &priv.Privileges[0].Luid ) == 0 ) return false; + + HANDLE pt; + if( OpenProcessToken( GetCurrentProcess(), TOKEN_ADJUST_PRIVILEGES, &pt ) == 0 ) return false; + const auto adjust = AdjustTokenPrivileges( pt, FALSE, &priv, 0, nullptr, nullptr ); + CloseHandle( pt ); + if( adjust == 0 ) return false; + const auto status = GetLastError(); + if( status != ERROR_SUCCESS ) return false; + + const auto psz = sizeof( EVENT_TRACE_PROPERTIES ) + sizeof( KERNEL_LOGGER_NAME ); + s_prop = (EVENT_TRACE_PROPERTIES*)tracy_malloc( psz ); + memset( s_prop, 0, sizeof( EVENT_TRACE_PROPERTIES ) ); + s_prop->EnableFlags = EVENT_TRACE_FLAG_CSWITCH | EVENT_TRACE_FLAG_DISPATCHER; + s_prop->LogFileMode = EVENT_TRACE_REAL_TIME_MODE; + s_prop->Wnode.BufferSize = psz; + s_prop->Wnode.Flags = WNODE_FLAG_TRACED_GUID; + s_prop->Wnode.ClientContext = 3; + s_prop->Wnode.Guid = SystemTraceControlGuid; + s_prop->LoggerNameOffset = sizeof( EVENT_TRACE_PROPERTIES ); + memcpy( ((char*)s_prop) + sizeof( EVENT_TRACE_PROPERTIES ), KERNEL_LOGGER_NAME, sizeof( KERNEL_LOGGER_NAME ) ); + + auto backup = tracy_malloc( psz ); + memcpy( backup, s_prop, psz ); + + const auto controlStatus = ControlTrace( 0, KERNEL_LOGGER_NAME, s_prop, EVENT_TRACE_CONTROL_STOP ); + if( controlStatus != ERROR_SUCCESS && controlStatus != ERROR_WMI_INSTANCE_NOT_FOUND ) + { + tracy_free( s_prop ); + return false; + } + + memcpy( s_prop, backup, psz ); + tracy_free( backup ); + + const auto startStatus = StartTrace( &s_traceHandle, KERNEL_LOGGER_NAME, s_prop ); + if( startStatus != ERROR_SUCCESS ) + { + tracy_free( s_prop ); + return false; + } + +#ifdef UNICODE + WCHAR KernelLoggerName[sizeof( KERNEL_LOGGER_NAME )]; +#else + char KernelLoggerName[sizeof( KERNEL_LOGGER_NAME )]; +#endif + memcpy( KernelLoggerName, KERNEL_LOGGER_NAME, sizeof( KERNEL_LOGGER_NAME ) ); + EVENT_TRACE_LOGFILE log = {}; + log.LoggerName = KernelLoggerName; + log.ProcessTraceMode = PROCESS_TRACE_MODE_REAL_TIME | PROCESS_TRACE_MODE_EVENT_RECORD | PROCESS_TRACE_MODE_RAW_TIMESTAMP; + log.EventRecordCallback = EventRecordCallback; + + s_traceHandle2 = OpenTrace( &log ); + if( s_traceHandle2 == (TRACEHANDLE)INVALID_HANDLE_VALUE ) + { + CloseTrace( s_traceHandle ); + tracy_free( s_prop ); + return false; + } + + return true; +} + +void SysTraceStop() +{ + CloseTrace( s_traceHandle2 ); + CloseTrace( s_traceHandle ); +} + +void SysTraceWorker( void* ptr ) +{ + SetThreadName( "Tracy SysTrace" ); + ProcessTrace( &s_traceHandle2, 1, 0, 0 ); + ControlTrace( 0, KERNEL_LOGGER_NAME, s_prop, EVENT_TRACE_CONTROL_STOP ); + tracy_free( s_prop ); +} + +#ifdef __CYGWIN__ +extern "C" typedef DWORD (WINAPI *t_GetProcessIdOfThread)( HANDLE ); +extern "C" typedef DWORD (WINAPI *t_GetProcessImageFileNameA)( HANDLE, LPSTR, DWORD ); +# ifdef UNICODE +t_GetProcessIdOfThread GetProcessIdOfThread = (t_GetProcessIdOfThread)GetProcAddress( GetModuleHandle( L"kernel32.dll" ), "GetProcessIdOfThread" ); +t_GetProcessImageFileNameA GetProcessImageFileNameA = (t_GetProcessImageFileNameA)GetProcAddress( GetModuleHandle( L"kernel32.dll" ), "K32GetProcessImageFileNameA" ); +# else +t_GetProcessIdOfThread GetProcessIdOfThread = 
(t_GetProcessIdOfThread)GetProcAddress( GetModuleHandle( "kernel32.dll" ), "GetProcessIdOfThread" ); +t_GetProcessImageFileNameA GetProcessImageFileNameA = (t_GetProcessImageFileNameA)GetProcAddress( GetModuleHandle( "kernel32.dll" ), "K32GetProcessImageFileNameA" ); +# endif +#endif + +extern "C" typedef NTSTATUS (WINAPI *t_NtQueryInformationThread)( HANDLE, THREADINFOCLASS, PVOID, ULONG, PULONG ); +extern "C" typedef BOOL (WINAPI *t_EnumProcessModules)( HANDLE, HMODULE*, DWORD, LPDWORD ); +extern "C" typedef BOOL (WINAPI *t_GetModuleInformation)( HANDLE, HMODULE, LPMODULEINFO, DWORD ); +extern "C" typedef DWORD (WINAPI *t_GetModuleBaseNameA)( HANDLE, HMODULE, LPSTR, DWORD ); +#ifdef UNICODE +t_NtQueryInformationThread NtQueryInformationThread = (t_NtQueryInformationThread)GetProcAddress( GetModuleHandle( L"ntdll.dll" ), "NtQueryInformationThread" ); +t_EnumProcessModules _EnumProcessModules = (t_EnumProcessModules)GetProcAddress( GetModuleHandle( L"kernel32.dll" ), "K32EnumProcessModules" ); +t_GetModuleInformation _GetModuleInformation = (t_GetModuleInformation)GetProcAddress( GetModuleHandle( L"kernel32.dll" ), "K32GetModuleInformation" ); +t_GetModuleBaseNameA _GetModuleBaseNameA = (t_GetModuleBaseNameA)GetProcAddress( GetModuleHandle( L"kernel32.dll" ), "K32GetModuleBaseNameA" ); +#else +t_NtQueryInformationThread NtQueryInformationThread = (t_NtQueryInformationThread)GetProcAddress( GetModuleHandle( "ntdll.dll" ), "NtQueryInformationThread" ); +t_EnumProcessModules _EnumProcessModules = (t_EnumProcessModules)GetProcAddress( GetModuleHandle( "kernel32.dll" ), "K32EnumProcessModules" ); +t_GetModuleInformation _GetModuleInformation = (t_GetModuleInformation)GetProcAddress( GetModuleHandle( "kernel32.dll" ), "K32GetModuleInformation" ); +t_GetModuleBaseNameA _GetModuleBaseNameA = (t_GetModuleBaseNameA)GetProcAddress( GetModuleHandle( "kernel32.dll" ), "K32GetModuleBaseNameA" ); +#endif + + +void SysTraceSendExternalName( uint64_t thread ) +{ + bool threadSent = false; + auto hnd = OpenThread( THREAD_QUERY_INFORMATION, FALSE, DWORD( thread ) ); + if( hnd == 0 ) + { + hnd = OpenThread( THREAD_QUERY_LIMITED_INFORMATION, FALSE, DWORD( thread ) ); + } + if( hnd != 0 ) + { +#if defined NTDDI_WIN10_RS2 && NTDDI_VERSION >= NTDDI_WIN10_RS2 + PWSTR tmp; + GetThreadDescription( hnd, &tmp ); + char buf[256]; + if( tmp ) + { + auto ret = wcstombs( buf, tmp, 256 ); + if( ret != 0 ) + { + GetProfiler().SendString( thread, buf, QueueType::ExternalThreadName ); + threadSent = true; + } + } +#endif + const auto pid = GetProcessIdOfThread( hnd ); + if( !threadSent && NtQueryInformationThread && _EnumProcessModules && _GetModuleInformation && _GetModuleBaseNameA ) + { + void* ptr; + ULONG retlen; + auto status = NtQueryInformationThread( hnd, (THREADINFOCLASS)9 /*ThreadQuerySetWin32StartAddress*/, &ptr, sizeof( &ptr ), &retlen ); + if( status == 0 ) + { + const auto phnd = OpenProcess( PROCESS_QUERY_INFORMATION | PROCESS_VM_READ, FALSE, pid ); + if( phnd != INVALID_HANDLE_VALUE ) + { + HMODULE modules[1024]; + DWORD needed; + if( _EnumProcessModules( phnd, modules, 1024 * sizeof( HMODULE ), &needed ) != 0 ) + { + const auto sz = std::min( DWORD( needed / sizeof( HMODULE ) ), DWORD( 1024 ) ); + for( DWORD i=0; i<sz; i++ ) + { + MODULEINFO info; + if( _GetModuleInformation( phnd, modules[i], &info, sizeof( info ) ) != 0 ) + { + if( (uint64_t)ptr >= (uint64_t)info.lpBaseOfDll && (uint64_t)ptr <= (uint64_t)info.lpBaseOfDll + (uint64_t)info.SizeOfImage ) + { + char buf[1024]; + if( _GetModuleBaseNameA( phnd, 
modules[i], buf, 1024 ) != 0 ) + { + GetProfiler().SendString( thread, buf, QueueType::ExternalThreadName ); + threadSent = true; + } + } + } + } + } + CloseHandle( phnd ); + } + } + } + CloseHandle( hnd ); + if( !threadSent ) + { + GetProfiler().SendString( thread, "???", QueueType::ExternalThreadName ); + threadSent = true; + } + if( pid != 0 ) + { + { + uint64_t _pid = pid; + Magic magic; + auto token = GetToken(); + auto& tail = token->get_tail_index(); + auto item = token->enqueue_begin( magic ); + MemWrite( &item->hdr.type, QueueType::TidToPid ); + MemWrite( &item->tidToPid.tid, thread ); + MemWrite( &item->tidToPid.pid, _pid ); + tail.store( magic + 1, std::memory_order_release ); + } + if( pid == 4 ) + { + GetProfiler().SendString( thread, "System", QueueType::ExternalName ); + return; + } + else + { + const auto phnd = OpenProcess( PROCESS_QUERY_LIMITED_INFORMATION, FALSE, pid ); + if( phnd != INVALID_HANDLE_VALUE ) + { + char buf[1024]; + const auto sz = GetProcessImageFileNameA( phnd, buf, 1024 ); + CloseHandle( phnd ); + if( sz != 0 ) + { + auto ptr = buf + sz - 1; + while( ptr > buf && *ptr != '\\' ) ptr--; + if( *ptr == '\\' ) ptr++; + GetProfiler().SendString( thread, ptr, QueueType::ExternalName ); + return; + } + } + } + } + } + + if( !threadSent ) + { + GetProfiler().SendString( thread, "???", QueueType::ExternalThreadName ); + } + GetProfiler().SendString( thread, "???", QueueType::ExternalName ); +} + +} + +# elif defined __linux__ + +# include <sys/types.h> +# include <sys/stat.h> +# include <sys/wait.h> +# include <fcntl.h> +# include <inttypes.h> +# include <limits> +# include <poll.h> +# include <stdio.h> +# include <stdlib.h> +# include <string.h> +# include <unistd.h> + +# include "TracyProfiler.hpp" + +# ifdef __ANDROID__ +# include "TracySysTracePayload.hpp" +# endif + +namespace tracy +{ + +static const char BasePath[] = "/sys/kernel/debug/tracing/"; +static const char TracingOn[] = "tracing_on"; +static const char CurrentTracer[] = "current_tracer"; +static const char TraceOptions[] = "trace_options"; +static const char TraceClock[] = "trace_clock"; +static const char SchedSwitch[] = "events/sched/sched_switch/enable"; +static const char SchedWakeup[] = "events/sched/sched_wakeup/enable"; +static const char BufferSizeKb[] = "buffer_size_kb"; +static const char TracePipe[] = "trace_pipe"; + +#ifdef __ANDROID__ +static bool TraceWrite( const char* path, size_t psz, const char* val, size_t vsz ) +{ + char tmp[256]; + sprintf( tmp, "su -c 'echo \"%s\" > %s%s'", val, BasePath, path ); + return system( tmp ) == 0; +} +#else +static bool TraceWrite( const char* path, size_t psz, const char* val, size_t vsz ) +{ + char tmp[256]; + memcpy( tmp, BasePath, sizeof( BasePath ) - 1 ); + memcpy( tmp + sizeof( BasePath ) - 1, path, psz ); + + int fd = open( tmp, O_WRONLY ); + if( fd < 0 ) return false; + + for(;;) + { + ssize_t cnt = write( fd, val, vsz ); + if( cnt == (ssize_t)vsz ) + { + close( fd ); + return true; + } + if( cnt < 0 ) + { + close( fd ); + return false; + } + vsz -= cnt; + val += cnt; + } +} +#endif + +#ifdef __ANDROID__ +void SysTraceInjectPayload() +{ + int pipefd[2]; + if( pipe( pipefd ) == 0 ) + { + const auto pid = fork(); + if( pid == 0 ) + { + // child + close( pipefd[1] ); + if( dup2( pipefd[0], STDIN_FILENO ) >= 0 ) + { + close( pipefd[0] ); + execlp( "su", "su", "-c", "cat > /data/tracy_systrace", (char*)nullptr ); + exit( 1 ); + } + } + else if( pid > 0 ) + { + // parent + close( pipefd[0] ); + +#ifdef __aarch64__ + write( pipefd[1], 
tracy_systrace_aarch64_data, tracy_systrace_aarch64_size ); +#else + write( pipefd[1], tracy_systrace_armv7_data, tracy_systrace_armv7_size ); +#endif + close( pipefd[1] ); + waitpid( pid, nullptr, 0 ); + + system( "su -c 'chmod 700 /data/tracy_systrace'" ); + } + } +} +#endif + +bool SysTraceStart() +{ + if( !TraceWrite( TracingOn, sizeof( TracingOn ), "0", 2 ) ) return false; + if( !TraceWrite( CurrentTracer, sizeof( CurrentTracer ), "nop", 4 ) ) return false; + TraceWrite( TraceOptions, sizeof( TraceOptions ), "norecord-cmd", 13 ); + TraceWrite( TraceOptions, sizeof( TraceOptions ), "norecord-tgid", 14 ); + TraceWrite( TraceOptions, sizeof( TraceOptions ), "noirq-info", 11 ); +#if defined TRACY_HW_TIMER && ( defined __i386 || defined _M_IX86 || defined __x86_64__ || defined _M_X64 ) + if( !TraceWrite( TraceClock, sizeof( TraceClock ), "x86-tsc", 8 ) ) return false; +#elif __ARM_ARCH >= 6 + if( !TraceWrite( TraceClock, sizeof( TraceClock ), "mono_raw", 9 ) ) return false; +#endif + if( !TraceWrite( SchedSwitch, sizeof( SchedSwitch ), "1", 2 ) ) return false; + if( !TraceWrite( SchedWakeup, sizeof( SchedWakeup ), "1", 2 ) ) return false; + if( !TraceWrite( BufferSizeKb, sizeof( BufferSizeKb ), "512", 4 ) ) return false; + +#if defined __ANDROID__ && ( defined __aarch64__ || defined __ARM_ARCH ) + SysTraceInjectPayload(); +#endif + + if( !TraceWrite( TracingOn, sizeof( TracingOn ), "1", 2 ) ) return false; + + return true; +} + +void SysTraceStop() +{ + TraceWrite( TracingOn, sizeof( TracingOn ), "0", 2 ); +} + +static uint64_t ReadNumber( const char*& ptr ) +{ + uint64_t val = 0; + for(;;) + { + if( *ptr >= '0' && *ptr <= '9' ) + { + val = val * 10 + ( *ptr - '0' ); + ptr++; + } + else + { + return val; + } + } +} + +static uint8_t ReadState( char state ) +{ + switch( state ) + { + case 'D': return 101; + case 'I': return 102; + case 'R': return 103; + case 'S': return 104; + case 'T': return 105; + case 't': return 106; + case 'W': return 107; + case 'X': return 108; + case 'Z': return 109; + default: return 100; + } +} + +#if defined __ANDROID__ && defined __ANDROID_API__ && __ANDROID_API__ < 18 +/*- + * Copyright (c) 2011 The NetBSD Foundation, Inc. + * All rights reserved. + * + * This code is derived from software contributed to The NetBSD Foundation + * by Christos Zoulas. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +ssize_t getdelim(char **buf, size_t *bufsiz, int delimiter, FILE *fp) +{ + char *ptr, *eptr; + + if (*buf == NULL || *bufsiz == 0) { + *bufsiz = BUFSIZ; + if ((*buf = (char*)malloc(*bufsiz)) == NULL) + return -1; + } + + for (ptr = *buf, eptr = *buf + *bufsiz;;) { + int c = fgetc(fp); + if (c == -1) { + if (feof(fp)) + return ptr == *buf ? -1 : ptr - *buf; + else + return -1; + } + *ptr++ = c; + if (c == delimiter) { + *ptr = '\0'; + return ptr - *buf; + } + if (ptr + 2 >= eptr) { + char *nbuf; + size_t nbufsiz = *bufsiz * 2; + ssize_t d = ptr - *buf; + if ((nbuf = (char*)realloc(*buf, nbufsiz)) == NULL) + return -1; + *buf = nbuf; + *bufsiz = nbufsiz; + eptr = nbuf + nbufsiz; + ptr = nbuf + d; + } + } +} + +ssize_t getline(char **buf, size_t *bufsiz, FILE *fp) +{ + return getdelim(buf, bufsiz, '\n', fp); +} +#endif + +static void HandleTraceLine( const char* line ) +{ + line += 24; + const auto cpu = (uint8_t)ReadNumber( line ); + + line++; // ']' + while( *line == ' ' ) line++; + +#if defined TRACY_HW_TIMER && ( defined __i386 || defined _M_IX86 || defined __x86_64__ || defined _M_X64 ) + const auto time = ReadNumber( line ); +#elif __ARM_ARCH >= 6 + const auto ts = ReadNumber( line ); + line++; // '.' + const auto tus = ReadNumber( line ); + const auto time = ts * 1000000000ll + tus * 1000ll; +#endif + + line += 2; // ': ' + if( memcmp( line, "sched_switch", 12 ) == 0 ) + { + line += 14; + + while( memcmp( line, "prev_pid", 8 ) != 0 ) line++; + line += 9; + + const auto oldPid = ReadNumber( line ); + line++; + + while( memcmp( line, "prev_state", 10 ) != 0 ) line++; + line += 11; + + const auto oldState = (uint8_t)ReadState( *line ); + line += 5; + + while( memcmp( line, "next_pid", 8 ) != 0 ) line++; + line += 9; + + const auto newPid = ReadNumber( line ); + + uint8_t reason = 100; + + Magic magic; + auto token = GetToken(); + auto& tail = token->get_tail_index(); + auto item = token->enqueue_begin( magic ); + MemWrite( &item->hdr.type, QueueType::ContextSwitch ); + MemWrite( &item->contextSwitch.time, time ); + MemWrite( &item->contextSwitch.oldThread, oldPid ); + MemWrite( &item->contextSwitch.newThread, newPid ); + MemWrite( &item->contextSwitch.cpu, cpu ); + MemWrite( &item->contextSwitch.reason, reason ); + MemWrite( &item->contextSwitch.state, oldState ); + tail.store( magic + 1, std::memory_order_release ); + } + else if( memcmp( line, "sched_wakeup", 12 ) == 0 ) + { + line += 14; + + while( memcmp( line, "pid", 3 ) != 0 ) line++; + line += 4; + + const auto pid = ReadNumber( line ); + + Magic magic; + auto token = GetToken(); + auto& tail = token->get_tail_index(); + auto item = token->enqueue_begin( magic ); + MemWrite( &item->hdr.type, QueueType::ThreadWakeup ); + MemWrite( &item->threadWakeup.time, time ); + MemWrite( &item->threadWakeup.thread, pid ); + tail.store( magic + 1, std::memory_order_release ); + } +} + +#ifdef __ANDROID__ +static void ProcessTraceLines( int fd ) +{ + // Linux pipe buffer is 64KB, additional 1KB is for unfinished 
lines + char* buf = (char*)tracy_malloc( (64+1)*1024 ); + char* line = buf; + + for(;;) + { + const auto rd = read( fd, line, 64*1024 ); + if( rd <= 0 ) break; + +#ifdef TRACY_ON_DEMAND + if( !GetProfiler().IsConnected() ) + { + if( rd < 64*1024 ) + { + assert( line[rd-1] == '\n' ); + line = buf; + std::this_thread::sleep_for( std::chrono::milliseconds( 10 ) ); + } + else + { + const auto end = line + rd; + line = end - 1; + while( line > buf && *line != '\n' ) line--; + if( line > buf ) + { + line++; + const auto lsz = end - line; + memmove( buf, line, lsz ); + line = buf + lsz; + } + } + continue; + } +#endif + + const auto end = line + rd; + line = buf; + for(;;) + { + auto next = line; + while( next < end && *next != '\n' ) next++; + next++; + if( next >= end ) + { + const auto lsz = end - line; + memmove( buf, line, lsz ); + line = buf + lsz; + break; + } + + HandleTraceLine( line ); + line = next; + } + if( rd < 64*1024 ) + { + std::this_thread::sleep_for( std::chrono::milliseconds( 10 ) ); + } + } + + tracy_free( buf ); +} + +void SysTraceWorker( void* ptr ) +{ + SetThreadName( "Tracy SysTrace" ); + int pipefd[2]; + if( pipe( pipefd ) == 0 ) + { + const auto pid = fork(); + if( pid == 0 ) + { + // child + close( pipefd[0] ); + dup2( pipefd[1], STDERR_FILENO ); + if( dup2( pipefd[1], STDOUT_FILENO ) >= 0 ) + { + close( pipefd[1] ); +#if defined __ANDROID__ && ( defined __aarch64__ || defined __ARM_ARCH ) + execlp( "su", "su", "-c", "/data/tracy_systrace", (char*)nullptr ); +#endif + execlp( "su", "su", "-c", "cat /sys/kernel/debug/tracing/trace_pipe", (char*)nullptr ); + exit( 1 ); + } + } + else if( pid > 0 ) + { + // parent + close( pipefd[1] ); + ProcessTraceLines( pipefd[0] ); + close( pipefd[0] ); + } + } +} +#else +static void ProcessTraceLines( int fd ) +{ + char* buf = (char*)tracy_malloc( 64*1024 ); + + struct pollfd pfd; + pfd.fd = fd; + pfd.events = POLLIN | POLLERR; + + for(;;) + { + while( poll( &pfd, 1, 0 ) <= 0 ) std::this_thread::sleep_for( std::chrono::milliseconds( 10 ) ); + + const auto rd = read( fd, buf, 64*1024 ); + if( rd <= 0 ) break; + +#ifdef TRACY_ON_DEMAND + if( !GetProfiler().IsConnected() ) continue; +#endif + + auto line = buf; + const auto end = buf + rd; + for(;;) + { + auto next = line; + while( next < end && *next != '\n' ) next++; + if( next == end ) break; + assert( *next == '\n' ); + next++; + + HandleTraceLine( line ); + line = next; + } + } + + tracy_free( buf ); +} + +void SysTraceWorker( void* ptr ) +{ + SetThreadName( "Tracy SysTrace" ); + char tmp[256]; + memcpy( tmp, BasePath, sizeof( BasePath ) - 1 ); + memcpy( tmp + sizeof( BasePath ) - 1, TracePipe, sizeof( TracePipe ) ); + + int fd = open( tmp, O_RDONLY ); + if( fd < 0 ) return; + ProcessTraceLines( fd ); + close( fd ); +} +#endif + +void SysTraceSendExternalName( uint64_t thread ) +{ + FILE* f; + char fn[256]; + sprintf( fn, "/proc/%" PRIu64 "/comm", thread ); + f = fopen( fn, "rb" ); + if( f ) + { + char buf[256]; + const auto sz = fread( buf, 1, 256, f ); + if( sz > 0 && buf[sz-1] == '\n' ) buf[sz-1] = '\0'; + GetProfiler().SendString( thread, buf, QueueType::ExternalThreadName ); + fclose( f ); + } + else + { + GetProfiler().SendString( thread, "???", QueueType::ExternalThreadName ); + } + + sprintf( fn, "/proc/%" PRIu64 "/status", thread ); + f = fopen( fn, "rb" ); + if( f ) + { + int pid = -1; + size_t lsz = 1024; + auto line = (char*)malloc( lsz ); + for(;;) + { + auto rd = getline( &line, &lsz, f ); + if( rd <= 0 ) break; + if( memcmp( "Tgid:\t", line, 6 ) == 0 ) + { + pid = 
atoi( line + 6 ); + break; + } + } + free( line ); + fclose( f ); + if( pid >= 0 ) + { + { + uint64_t _pid = pid; + Magic magic; + auto token = GetToken(); + auto& tail = token->get_tail_index(); + auto item = token->enqueue_begin( magic ); + MemWrite( &item->hdr.type, QueueType::TidToPid ); + MemWrite( &item->tidToPid.tid, thread ); + MemWrite( &item->tidToPid.pid, _pid ); + tail.store( magic + 1, std::memory_order_release ); + } + sprintf( fn, "/proc/%i/comm", pid ); + f = fopen( fn, "rb" ); + if( f ) + { + char buf[256]; + const auto sz = fread( buf, 1, 256, f ); + if( sz > 0 && buf[sz-1] == '\n' ) buf[sz-1] = '\0'; + GetProfiler().SendString( thread, buf, QueueType::ExternalName ); + fclose( f ); + return; + } + } + } + GetProfiler().SendString( thread, "???", QueueType::ExternalName ); +} + +} + +# endif + +#endif diff --git a/libs/tracy/client/TracySysTrace.hpp b/libs/tracy/client/TracySysTrace.hpp @@ -0,0 +1,25 @@ +#ifndef __TRACYSYSTRACE_HPP__ +#define __TRACYSYSTRACE_HPP__ + +#if !defined TRACY_NO_SYSTEM_TRACING && ( defined _WIN32 || defined __CYGWIN__ || defined __linux__ ) +# define TRACY_HAS_SYSTEM_TRACING +#endif + +#ifdef TRACY_HAS_SYSTEM_TRACING + +#include <stdint.h> + +namespace tracy +{ + +bool SysTraceStart(); +void SysTraceStop(); +void SysTraceWorker( void* ptr ); + +void SysTraceSendExternalName( uint64_t thread ); + +} + +#endif + +#endif diff --git a/libs/tracy/client/TracySysTracePayload.hpp b/libs/tracy/client/TracySysTracePayload.hpp @@ -0,0 +1,80 @@ +// File: '/home/wolf/desktop/tracy_systrace.armv7' (1210 bytes)` +// File: '/home/wolf/desktop/tracy_systrace.aarch64' (1650 bytes) + +// Exported using binary_to_compressed_c.cpp + +namespace tracy +{ + +static const unsigned int tracy_systrace_armv7_size = 1210; +static const unsigned int tracy_systrace_armv7_data[1212/4] = +{ + 0x464c457f, 0x00010101, 0x00000000, 0x00000000, 0x00280003, 0x00000001, 0x00000208, 0x00000034, 0x00000000, 0x05000200, 0x00200034, 0x00280007, + 0x00000000, 0x00000006, 0x00000034, 0x00000034, 0x00000034, 0x000000e0, 0x000000e0, 0x00000004, 0x00000004, 0x00000003, 0x00000114, 0x00000114, + 0x00000114, 0x00000013, 0x00000013, 0x00000004, 0x00000001, 0x00000001, 0x00000000, 0x00000000, 0x00000000, 0x000003ed, 0x000003ed, 0x00000005, + 0x00001000, 0x00000001, 0x000003ed, 0x000013ed, 0x000013ed, 0x000000cd, 0x000000cf, 0x00000006, 0x00001000, 0x00000002, 0x000003f0, 0x000013f0, + 0x000013f0, 0x000000b8, 0x000000b8, 0x00000006, 0x00000004, 0x6474e551, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000006, + 0x00000010, 0x70000001, 0x00000394, 0x00000394, 0x00000394, 0x00000008, 0x00000008, 0x00000004, 0x00000004, 0x7379732f, 0x2f6d6574, 0x2f6e6962, + 0x6b6e696c, 0x00007265, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000001, 0x00000000, 0x00000000, 0x00000012, 0x00000016, 0x00000000, + 0x00000000, 0x00000012, 0x6f6c6400, 0x006e6570, 0x4342494c, 0x62696c00, 0x732e6c64, 0x6c64006f, 0x006d7973, 0x00000001, 0x00000003, 0x00000001, + 0x00000000, 0x00000000, 0x00000000, 0x00000001, 0x00000003, 0x00000002, 0x00000000, 0x00000000, 0x00000001, 0x00020000, 0x00000002, 0x00010001, + 0x0000000d, 0x00000010, 0x00000000, 0x00050d63, 0x00020000, 0x00000008, 0x00000000, 0x000014b4, 0x00000116, 0x000014b8, 0x00000216, 0xe52de004, + 0xe59fe004, 0xe08fe00e, 0xe5bef008, 0x000012bc, 0xe28fc600, 0xe28cca01, 0xe5bcf2bc, 0xe28fc600, 0xe28cca01, 0xe5bcf2b4, 0xe92d4ff0, 0xe28db01c, + 0xe24dd01c, 0xe24dd801, 0xe59f0154, 0xe3a01001, 0xe08f0000, 0xebfffff1, 0xe59f1148, 0xe1a07000, 0xe08f1001, 
0xebfffff0, 0xe59f113c, 0xe1a09000, + 0xe1a00007, 0xe08f1001, 0xebffffeb, 0xe59f112c, 0xe1a04000, 0xe1a00007, 0xe08f1001, 0xebffffe6, 0xe59f111c, 0xe1a05000, 0xe1a00007, 0xe08f1001, + 0xebffffe1, 0xe59f110c, 0xe1a06000, 0xe1a00007, 0xe08f1001, 0xebffffdc, 0xe58d0004, 0xe1a00007, 0xe59f10f4, 0xe08f1001, 0xebffffd7, 0xe1a0a000, + 0xe59f00e8, 0xe3a01000, 0xe3a08000, 0xe08f0000, 0xe12fff39, 0xe1a07000, 0xe3700001, 0xca000001, 0xe3a00000, 0xe12fff34, 0xe3a00009, 0xe58d4000, + 0xe1cd01b4, 0xe3090680, 0xe3400098, 0xe28d4010, 0xe58d000c, 0xe28d9018, 0xe58d8008, 0xe28d8008, 0xe58d7010, 0xea000003, 0xe1a02000, 0xe3a00001, + 0xe1a01009, 0xe12fff3a, 0xe1a00004, 0xe3a01001, 0xe3a02000, 0xe12fff35, 0xe3500000, 0xca000008, 0xe1a00008, 0xe3a01000, 0xe12fff36, 0xe1a00004, + 0xe3a01001, 0xe3a02000, 0xe12fff35, 0xe3500001, 0xbafffff6, 0xe59d3004, 0xe1a00007, 0xe1a01009, 0xe3a02801, 0xe12fff33, 0xe3500001, 0xaaffffe5, + 0xe59d1000, 0xe3a00000, 0xe12fff31, 0xe24bd01c, 0xe8bd8ff0, 0x00000174, 0x0000016c, 0x0000015d, 0x0000014e, 0x0000013f, 0x00000135, 0x00000126, + 0x00000114, 0x7ffffe74, 0x00000001, 0x6362696c, 0x006f732e, 0x6e65706f, 0x69786500, 0x6f700074, 0x6e006c6c, 0x736f6e61, 0x7065656c, 0x61657200, + 0x72770064, 0x00657469, 0x7379732f, 0x72656b2f, 0x2f6c656e, 0x75626564, 0x72742f67, 0x6e696361, 0x72742f67, 0x5f656361, 0x65706970, 0x00000000, + 0x00000003, 0x000014a8, 0x00000002, 0x00000010, 0x00000017, 0x000001cc, 0x00000014, 0x00000011, 0x00000015, 0x00000000, 0x00000006, 0x00000128, + 0x0000000b, 0x00000010, 0x00000005, 0x00000158, 0x0000000a, 0x0000001c, 0x6ffffef5, 0x00000174, 0x00000004, 0x0000018c, 0x00000001, 0x0000000d, + 0x0000001e, 0x00000008, 0x6ffffffb, 0x00000001, 0x6ffffff0, 0x000001a4, 0x6ffffffe, 0x000001ac, 0x6fffffff, 0x00000001, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x000001dc, 0x000001dc, +}; + +static const unsigned int tracy_systrace_aarch64_size = 1650; +static const unsigned int tracy_systrace_aarch64_data[1652/4] = +{ + 0x464c457f, 0x00010102, 0x00000000, 0x00000000, 0x00b70003, 0x00000001, 0x00000300, 0x00000000, 0x00000040, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00380040, 0x00400006, 0x00000000, 0x00000006, 0x00000005, 0x00000040, 0x00000000, 0x00000040, 0x00000000, 0x00000040, 0x00000000, + 0x00000150, 0x00000000, 0x00000150, 0x00000000, 0x00000008, 0x00000000, 0x00000003, 0x00000004, 0x00000190, 0x00000000, 0x00000190, 0x00000000, + 0x00000190, 0x00000000, 0x00000015, 0x00000000, 0x00000015, 0x00000000, 0x00000001, 0x00000000, 0x00000001, 0x00000005, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x000004d1, 0x00000000, 0x000004d1, 0x00000000, 0x00010000, 0x00000000, 0x00000001, 0x00000006, + 0x000004d8, 0x00000000, 0x000104d8, 0x00000000, 0x000104d8, 0x00000000, 0x0000019a, 0x00000000, 0x000001a0, 0x00000000, 0x00010000, 0x00000000, + 0x00000002, 0x00000006, 0x000004d8, 0x00000000, 0x000104d8, 0x00000000, 0x000104d8, 0x00000000, 0x00000170, 0x00000000, 0x00000170, 0x00000000, + 0x00000008, 0x00000000, 0x6474e551, 0x00000006, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000010, 0x00000000, 0x7379732f, 0x2f6d6574, 0x2f6e6962, 0x6b6e696c, 0x34367265, 0x00000000, 0x00000001, 0x00000004, + 0x00000003, 0x00000000, 0x00000000, 0x00000000, 0x00000002, 0x00000000, 0x00000001, 0x00000001, 0x00000001, 0x00000000, 
0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x000a0003, 0x00000300, 0x00000000, + 0x00000000, 0x00000000, 0x0000000a, 0x00000012, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000011, 0x00000012, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x62696c00, 0x732e6c64, 0x6c64006f, 0x6e65706f, 0x736c6400, 0x4c006d79, 0x00434249, 0x00000000, 0x00020002, 0x00000000, + 0x00010001, 0x00000001, 0x00000010, 0x00000000, 0x00050d63, 0x00020000, 0x00000017, 0x00000000, 0x00010668, 0x00000000, 0x00000402, 0x00000002, + 0x00000000, 0x00000000, 0x00010670, 0x00000000, 0x00000402, 0x00000003, 0x00000000, 0x00000000, 0xa9bf7bf0, 0x90000090, 0xf9433211, 0x91198210, + 0xd61f0220, 0xd503201f, 0xd503201f, 0xd503201f, 0x90000090, 0xf9433611, 0x9119a210, 0xd61f0220, 0x90000090, 0xf9433a11, 0x9119c210, 0xd61f0220, + 0xf81b0ffc, 0xa9015ff8, 0xa90257f6, 0xa9034ff4, 0xa9047bfd, 0x910103fd, 0xd14043ff, 0xd10043ff, 0x90000000, 0x91120000, 0x320003e1, 0x97ffffed, + 0x90000001, 0x91122021, 0xaa0003f7, 0x97ffffed, 0x90000001, 0xaa0003f8, 0x91123421, 0xaa1703e0, 0x97ffffe8, 0x90000001, 0xaa0003f3, 0x91124821, + 0xaa1703e0, 0x97ffffe3, 0x90000001, 0xaa0003f4, 0x91125c21, 0xaa1703e0, 0x97ffffde, 0x90000001, 0xaa0003f5, 0x91128421, 0xaa1703e0, 0x97ffffd9, + 0x90000001, 0xaa0003f6, 0x91129821, 0xaa1703e0, 0x97ffffd4, 0xaa0003f7, 0x90000000, 0x9112b000, 0x2a1f03e1, 0xd63f0300, 0x2a0003f8, 0x36f80060, + 0x2a1f03e0, 0xd63f0260, 0x90000008, 0x3dc11d00, 0x52800128, 0xb81c83b8, 0x781cc3a8, 0x3d8003e0, 0x14000005, 0x93407c02, 0x320003e0, 0x910043e1, + 0xd63f02e0, 0xd100e3a0, 0x320003e1, 0x2a1f03e2, 0xd63f0280, 0x7100001f, 0x5400014c, 0x910003e0, 0xaa1f03e1, 0xd63f02a0, 0xd100e3a0, 0x320003e1, + 0x2a1f03e2, 0xd63f0280, 0x7100041f, 0x54ffff0b, 0x910043e1, 0x321003e2, 0x2a1803e0, 0xd63f02c0, 0x7100041f, 0x54fffd0a, 0x2a1f03e0, 0xd63f0260, + 0x914043ff, 0x910043ff, 0xa9447bfd, 0xa9434ff4, 0xa94257f6, 0xa9415ff8, 0xf84507fc, 0xd65f03c0, 0x00000000, 0x00000000, 0x00989680, 0x00000000, + 0x6362696c, 0x006f732e, 0x6e65706f, 0x69786500, 0x6f700074, 0x6e006c6c, 0x736f6e61, 0x7065656c, 0x61657200, 0x72770064, 0x00657469, 0x7379732f, + 0x72656b2f, 0x2f6c656e, 0x75626564, 0x72742f67, 0x6e696361, 0x72742f67, 0x5f656361, 0x65706970, 0x00000000, 0x00000000, 0x00000001, 0x00000000, + 0x00000001, 0x00000000, 0x00000004, 0x00000000, 0x000001a8, 0x00000000, 0x6ffffef5, 0x00000000, 0x000001c8, 0x00000000, 0x00000005, 0x00000000, + 0x00000248, 0x00000000, 0x00000006, 0x00000000, 0x000001e8, 0x00000000, 0x0000000a, 0x00000000, 0x0000001c, 0x00000000, 0x0000000b, 0x00000000, + 0x00000018, 0x00000000, 0x00000015, 0x00000000, 0x00000000, 0x00000000, 0x00000003, 0x00000000, 0x00010650, 0x00000000, 0x00000002, 0x00000000, + 0x00000030, 0x00000000, 0x00000014, 0x00000000, 0x00000007, 0x00000000, 0x00000017, 0x00000000, 0x00000290, 0x00000000, 0x0000001e, 0x00000000, + 0x00000008, 0x00000000, 0x6ffffffb, 0x00000000, 0x00000001, 0x00000000, 0x6ffffffe, 0x00000000, 0x00000270, 0x00000000, 0x6fffffff, 0x00000000, + 0x00000001, 0x00000000, 0x6ffffff0, 0x00000000, 0x00000264, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x000104d8, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 
0x000002c0, 0x00000000, 0x000002c0, +}; + +} diff --git a/libs/tracy/client/TracyThread.hpp b/libs/tracy/client/TracyThread.hpp @@ -0,0 +1,70 @@ +#ifndef __TRACYTHREAD_HPP__ +#define __TRACYTHREAD_HPP__ + +#if defined _WIN32 || defined __CYGWIN__ +# include <windows.h> +#else +# include <pthread.h> +#endif + +namespace tracy +{ + +#if defined _WIN32 || defined __CYGWIN__ + +class Thread +{ +public: + Thread( void(*func)( void* ptr ), void* ptr ) + : m_func( func ) + , m_ptr( ptr ) + , m_hnd( CreateThread( nullptr, 0, Launch, this, 0, nullptr ) ) + {} + + ~Thread() + { + WaitForSingleObject( m_hnd, INFINITE ); + CloseHandle( m_hnd ); + } + + HANDLE Handle() const { return m_hnd; } + +private: + static DWORD WINAPI Launch( void* ptr ) { ((Thread*)ptr)->m_func( ((Thread*)ptr)->m_ptr ); return 0; } + + void(*m_func)( void* ptr ); + void* m_ptr; + HANDLE m_hnd; +}; + +#else + +class Thread +{ +public: + Thread( void(*func)( void* ptr ), void* ptr ) + : m_func( func ) + , m_ptr( ptr ) + { + pthread_create( &m_thread, nullptr, Launch, this ); + } + + ~Thread() + { + pthread_join( m_thread, nullptr ); + } + + pthread_t Handle() const { return m_thread; } + +private: + static void* Launch( void* ptr ) { ((Thread*)ptr)->m_func( ((Thread*)ptr)->m_ptr ); return nullptr; } + void(*m_func)( void* ptr ); + void* m_ptr; + pthread_t m_thread; +}; + +#endif + +} + +#endif diff --git a/libs/tracy/client/tracy_concurrentqueue.h b/libs/tracy/client/tracy_concurrentqueue.h @@ -0,0 +1,1552 @@ +// Provides a C++11 implementation of a multi-producer, multi-consumer lock-free queue. +// An overview, including benchmark results, is provided here: +// http://moodycamel.com/blog/2014/a-fast-general-purpose-lock-free-queue-for-c++ +// The full design is also described in excruciating detail at: +// http://moodycamel.com/blog/2014/detailed-design-of-a-lock-free-queue + +// Simplified BSD license: +// Copyright (c) 2013-2016, Cameron Desrochers. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// - Redistributions of source code must retain the above copyright notice, this list of +// conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, this list of +// conditions and the following disclaimer in the documentation and/or other materials +// provided with the distribution. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL +// THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT +// OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +// HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR +// TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +// EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ + +#pragma once + +#include "../common/TracyAlloc.hpp" +#include "../common/TracyForceInline.hpp" +#include "../common/TracySystem.hpp" + +#if defined(__GNUC__) +// Disable -Wconversion warnings (spuriously triggered when Traits::size_t and +// Traits::index_t are set to < 32 bits, causing integer promotion, causing warnings +// upon assigning any computed values) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wconversion" +#endif + +#if defined(__APPLE__) +#include "TargetConditionals.h" +#endif + +#include <atomic> // Requires C++11. Sorry VS2010. +#include <cassert> +#include <cstddef> // for max_align_t +#include <cstdint> +#include <cstdlib> +#include <type_traits> +#include <algorithm> +#include <utility> +#include <limits> +#include <climits> // for CHAR_BIT +#include <array> +#include <thread> // partly for __WINPTHREADS_VERSION if on MinGW-w64 w/ POSIX threading + +namespace tracy +{ + +// Exceptions +#ifndef MOODYCAMEL_EXCEPTIONS_ENABLED +#if (defined(_MSC_VER) && defined(_CPPUNWIND)) || (defined(__GNUC__) && defined(__EXCEPTIONS)) || (!defined(_MSC_VER) && !defined(__GNUC__)) +#define MOODYCAMEL_EXCEPTIONS_ENABLED +#endif +#endif +#ifdef MOODYCAMEL_EXCEPTIONS_ENABLED +#define MOODYCAMEL_TRY try +#define MOODYCAMEL_CATCH(...) catch(__VA_ARGS__) +#define MOODYCAMEL_RETHROW throw +#define MOODYCAMEL_THROW(expr) throw (expr) +#else +#define MOODYCAMEL_TRY if (true) +#define MOODYCAMEL_CATCH(...) else if (false) +#define MOODYCAMEL_RETHROW +#define MOODYCAMEL_THROW(expr) +#endif + +#ifndef MOODYCAMEL_NOEXCEPT +#if !defined(MOODYCAMEL_EXCEPTIONS_ENABLED) +#define MOODYCAMEL_NOEXCEPT +#define MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr) true +#define MOODYCAMEL_NOEXCEPT_ASSIGN(type, valueType, expr) true +#elif defined(_MSC_VER) && defined(_NOEXCEPT) && _MSC_VER < 1800 +// VS2012's std::is_nothrow_[move_]constructible is broken and returns true when it shouldn't :-( +// We have to assume *all* non-trivial constructors may throw on VS2012! +#define MOODYCAMEL_NOEXCEPT _NOEXCEPT +#define MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr) (std::is_rvalue_reference<valueType>::value && std::is_move_constructible<type>::value ? std::is_trivially_move_constructible<type>::value : std::is_trivially_copy_constructible<type>::value) +#define MOODYCAMEL_NOEXCEPT_ASSIGN(type, valueType, expr) ((std::is_rvalue_reference<valueType>::value && std::is_move_assignable<type>::value ? std::is_trivially_move_assignable<type>::value || std::is_nothrow_move_assignable<type>::value : std::is_trivially_copy_assignable<type>::value || std::is_nothrow_copy_assignable<type>::value) && MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr)) +#elif defined(_MSC_VER) && defined(_NOEXCEPT) && _MSC_VER < 1900 +#define MOODYCAMEL_NOEXCEPT _NOEXCEPT +#define MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr) (std::is_rvalue_reference<valueType>::value && std::is_move_constructible<type>::value ? std::is_trivially_move_constructible<type>::value || std::is_nothrow_move_constructible<type>::value : std::is_trivially_copy_constructible<type>::value || std::is_nothrow_copy_constructible<type>::value) +#define MOODYCAMEL_NOEXCEPT_ASSIGN(type, valueType, expr) ((std::is_rvalue_reference<valueType>::value && std::is_move_assignable<type>::value ? 
std::is_trivially_move_assignable<type>::value || std::is_nothrow_move_assignable<type>::value : std::is_trivially_copy_assignable<type>::value || std::is_nothrow_copy_assignable<type>::value) && MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr)) +#else +#define MOODYCAMEL_NOEXCEPT noexcept +#define MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr) noexcept(expr) +#define MOODYCAMEL_NOEXCEPT_ASSIGN(type, valueType, expr) noexcept(expr) +#endif +#endif + +// VS2012 doesn't support deleted functions. +// In this case, we declare the function normally but don't define it. A link error will be generated if the function is called. +#ifndef MOODYCAMEL_DELETE_FUNCTION +#if defined(_MSC_VER) && _MSC_VER < 1800 +#define MOODYCAMEL_DELETE_FUNCTION +#else +#define MOODYCAMEL_DELETE_FUNCTION = delete +#endif +#endif + +// Compiler-specific likely/unlikely hints +namespace moodycamel { namespace details { +#if defined(__GNUC__) + inline bool cqLikely(bool x) { return __builtin_expect((x), true); } + inline bool cqUnlikely(bool x) { return __builtin_expect((x), false); } +#else + inline bool cqLikely(bool x) { return x; } + inline bool cqUnlikely(bool x) { return x; } +#endif +} } + +namespace +{ + // to avoid MSVC warning 4127: conditional expression is constant + template <bool> + struct compile_time_condition + { + static const bool value = false; + }; + template <> + struct compile_time_condition<true> + { + static const bool value = true; + }; +} + +namespace moodycamel { +namespace details { + template<typename T> + struct const_numeric_max { + static_assert(std::is_integral<T>::value, "const_numeric_max can only be used with integers"); + static const T value = std::numeric_limits<T>::is_signed + ? (static_cast<T>(1) << (sizeof(T) * CHAR_BIT - 1)) - static_cast<T>(1) + : static_cast<T>(-1); + }; + +#if defined(__GLIBCXX__) + typedef ::max_align_t std_max_align_t; // libstdc++ forgot to add it to std:: for a while +#else + typedef std::max_align_t std_max_align_t; // Others (e.g. MSVC) insist it can *only* be accessed via std:: +#endif + + // Some platforms have incorrectly set max_align_t to a type with <8 bytes alignment even while supporting + // 8-byte aligned scalar values (*cough* 32-bit iOS). Work around this with our own union. See issue #64. + typedef union { + std_max_align_t x; + long long y; + void* z; + } max_align_t; +} + +// Default traits for the ConcurrentQueue. To change some of the +// traits without re-implementing all of them, inherit from this +// struct and shadow the declarations you wish to be different; +// since the traits are used as a template type parameter, the +// shadowed declarations will be used where defined, and the defaults +// otherwise. +struct ConcurrentQueueDefaultTraits +{ + // General-purpose size type. std::size_t is strongly recommended. + typedef std::size_t size_t; + + // The type used for the enqueue and dequeue indices. Must be at least as + // large as size_t. Should be significantly larger than the number of elements + // you expect to hold at once, especially if you have a high turnover rate; + // for example, on 32-bit x86, if you expect to have over a hundred million + // elements or pump several million elements through your queue in a very + // short space of time, using a 32-bit type *may* trigger a race condition. + // A 64-bit int type is recommended in that case, and in practice will + // prevent a race condition no matter the usage of the queue. 
Note that + // whether the queue is lock-free with a 64-int type depends on the whether + // std::atomic<std::uint64_t> is lock-free, which is platform-specific. + typedef std::size_t index_t; + + // Internally, all elements are enqueued and dequeued from multi-element + // blocks; this is the smallest controllable unit. If you expect few elements + // but many producers, a smaller block size should be favoured. For few producers + // and/or many elements, a larger block size is preferred. A sane default + // is provided. Must be a power of 2. + static const size_t BLOCK_SIZE = 128; + + // For explicit producers (i.e. when using a producer token), the block is + // checked for being empty by iterating through a list of flags, one per element. + // For large block sizes, this is too inefficient, and switching to an atomic + // counter-based approach is faster. The switch is made for block sizes strictly + // larger than this threshold. + static const size_t EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD = 32; + + // How many full blocks can be expected for a single explicit producer? This should + // reflect that number's maximum for optimal performance. Must be a power of 2. + static const size_t EXPLICIT_INITIAL_INDEX_SIZE = 32; + + // Controls the number of items that an explicit consumer (i.e. one with a token) + // must consume before it causes all consumers to rotate and move on to the next + // internal queue. + static const std::uint32_t EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE = 256; + + // The maximum number of elements (inclusive) that can be enqueued to a sub-queue. + // Enqueue operations that would cause this limit to be surpassed will fail. Note + // that this limit is enforced at the block level (for performance reasons), i.e. + // it's rounded up to the nearest block size. + static const size_t MAX_SUBQUEUE_SIZE = details::const_numeric_max<size_t>::value; + + + // Memory allocation can be customized if needed. + // malloc should return nullptr on failure, and handle alignment like std::malloc. +#if defined(malloc) || defined(free) + // Gah, this is 2015, stop defining macros that break standard code already! + // Work around malloc/free being special macros: + static inline void* WORKAROUND_malloc(size_t size) { return malloc(size); } + static inline void WORKAROUND_free(void* ptr) { return free(ptr); } + static inline void* (malloc)(size_t size) { return WORKAROUND_malloc(size); } + static inline void (free)(void* ptr) { return WORKAROUND_free(ptr); } +#else + static inline void* malloc(size_t size) { return tracy::tracy_malloc(size); } + static inline void free(void* ptr) { return tracy::tracy_free(ptr); } +#endif +}; + + +// When producing or consuming many elements, the most efficient way is to: +// 1) Use one of the bulk-operation methods of the queue with a token +// 2) Failing that, use the bulk-operation methods without a token +// 3) Failing that, create a token and use that with the single-item methods +// 4) Failing that, use the single-parameter methods of the queue +// Having said that, don't create tokens willy-nilly -- ideally there should be +// a maximum of one token per thread (of each kind). 
+struct ProducerToken; +struct ConsumerToken; + +template<typename T, typename Traits> class ConcurrentQueue; + + +namespace details +{ + struct ConcurrentQueueProducerTypelessBase + { + ConcurrentQueueProducerTypelessBase* next; + std::atomic<bool> inactive; + ProducerToken* token; + uint64_t threadId; + + ConcurrentQueueProducerTypelessBase() + : next(nullptr), inactive(false), token(nullptr), threadId(0) + { + } + }; + + template<typename T> + static inline bool circular_less_than(T a, T b) + { +#ifdef _MSC_VER +#pragma warning(push) +#pragma warning(disable: 4554) +#endif + static_assert(std::is_integral<T>::value && !std::numeric_limits<T>::is_signed, "circular_less_than is intended to be used only with unsigned integer types"); + return static_cast<T>(a - b) > static_cast<T>(static_cast<T>(1) << static_cast<T>(sizeof(T) * CHAR_BIT - 1)); +#ifdef _MSC_VER +#pragma warning(pop) +#endif + } + + template<typename U> + static inline char* align_for(char* ptr) + { + const std::size_t alignment = std::alignment_of<U>::value; + return ptr + (alignment - (reinterpret_cast<std::uintptr_t>(ptr) % alignment)) % alignment; + } + + template<typename T> + static inline T ceil_to_pow_2(T x) + { + static_assert(std::is_integral<T>::value && !std::numeric_limits<T>::is_signed, "ceil_to_pow_2 is intended to be used only with unsigned integer types"); + + // Adapted from http://graphics.stanford.edu/~seander/bithacks.html#RoundUpPowerOf2 + --x; + x |= x >> 1; + x |= x >> 2; + x |= x >> 4; + for (std::size_t i = 1; i < sizeof(T); i <<= 1) { + x |= x >> (i << 3); + } + ++x; + return x; + } + + template<typename T> + static inline void swap_relaxed(std::atomic<T>& left, std::atomic<T>& right) + { + T temp = std::move(left.load(std::memory_order_relaxed)); + left.store(std::move(right.load(std::memory_order_relaxed)), std::memory_order_relaxed); + right.store(std::move(temp), std::memory_order_relaxed); + } + + template<typename T> + static inline T const& nomove(T const& x) + { + return x; + } + + template<bool Enable> + struct nomove_if + { + template<typename T> + static inline T const& eval(T const& x) + { + return x; + } + }; + + template<> + struct nomove_if<false> + { + template<typename U> + static inline auto eval(U&& x) + -> decltype(std::forward<U>(x)) + { + return std::forward<U>(x); + } + }; + + template<typename It> + static inline auto deref_noexcept(It& it) MOODYCAMEL_NOEXCEPT -> decltype(*it) + { + return *it; + } + +#if defined(__clang__) || !defined(__GNUC__) || __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8) + template<typename T> struct is_trivially_destructible : std::is_trivially_destructible<T> { }; +#else + template<typename T> struct is_trivially_destructible : std::has_trivial_destructor<T> { }; +#endif + + template<typename T> struct static_is_lock_free_num { enum { value = 0 }; }; + template<> struct static_is_lock_free_num<signed char> { enum { value = ATOMIC_CHAR_LOCK_FREE }; }; + template<> struct static_is_lock_free_num<short> { enum { value = ATOMIC_SHORT_LOCK_FREE }; }; + template<> struct static_is_lock_free_num<int> { enum { value = ATOMIC_INT_LOCK_FREE }; }; + template<> struct static_is_lock_free_num<long> { enum { value = ATOMIC_LONG_LOCK_FREE }; }; + template<> struct static_is_lock_free_num<long long> { enum { value = ATOMIC_LLONG_LOCK_FREE }; }; + template<typename T> struct static_is_lock_free : static_is_lock_free_num<typename std::make_signed<T>::type> { }; + template<> struct static_is_lock_free<bool> { enum { value = ATOMIC_BOOL_LOCK_FREE }; }; + 
template<typename U> struct static_is_lock_free<U*> { enum { value = ATOMIC_POINTER_LOCK_FREE }; }; +} + + +struct ProducerToken +{ + template<typename T, typename Traits> + explicit ProducerToken(ConcurrentQueue<T, Traits>& queue); + + ProducerToken(ProducerToken&& other) MOODYCAMEL_NOEXCEPT + : producer(other.producer) + { + other.producer = nullptr; + if (producer != nullptr) { + producer->token = this; + } + } + + inline ProducerToken& operator=(ProducerToken&& other) MOODYCAMEL_NOEXCEPT + { + swap(other); + return *this; + } + + void swap(ProducerToken& other) MOODYCAMEL_NOEXCEPT + { + std::swap(producer, other.producer); + if (producer != nullptr) { + producer->token = this; + } + if (other.producer != nullptr) { + other.producer->token = &other; + } + } + + // A token is always valid unless: + // 1) Memory allocation failed during construction + // 2) It was moved via the move constructor + // (Note: assignment does a swap, leaving both potentially valid) + // 3) The associated queue was destroyed + // Note that if valid() returns true, that only indicates + // that the token is valid for use with a specific queue, + // but not which one; that's up to the user to track. + inline bool valid() const { return producer != nullptr; } + + ~ProducerToken() + { + if (producer != nullptr) { + producer->token = nullptr; + producer->inactive.store(true, std::memory_order_release); + } + } + + // Disable copying and assignment + ProducerToken(ProducerToken const&) MOODYCAMEL_DELETE_FUNCTION; + ProducerToken& operator=(ProducerToken const&) MOODYCAMEL_DELETE_FUNCTION; + +private: + template<typename T, typename Traits> friend class ConcurrentQueue; + +protected: + details::ConcurrentQueueProducerTypelessBase* producer; +}; + + +struct ConsumerToken +{ + template<typename T, typename Traits> + explicit ConsumerToken(ConcurrentQueue<T, Traits>& q); + + ConsumerToken(ConsumerToken&& other) MOODYCAMEL_NOEXCEPT + : initialOffset(other.initialOffset), lastKnownGlobalOffset(other.lastKnownGlobalOffset), itemsConsumedFromCurrent(other.itemsConsumedFromCurrent), currentProducer(other.currentProducer), desiredProducer(other.desiredProducer) + { + } + + inline ConsumerToken& operator=(ConsumerToken&& other) MOODYCAMEL_NOEXCEPT + { + swap(other); + return *this; + } + + void swap(ConsumerToken& other) MOODYCAMEL_NOEXCEPT + { + std::swap(initialOffset, other.initialOffset); + std::swap(lastKnownGlobalOffset, other.lastKnownGlobalOffset); + std::swap(itemsConsumedFromCurrent, other.itemsConsumedFromCurrent); + std::swap(currentProducer, other.currentProducer); + std::swap(desiredProducer, other.desiredProducer); + } + + // Disable copying and assignment + ConsumerToken(ConsumerToken const&) MOODYCAMEL_DELETE_FUNCTION; + ConsumerToken& operator=(ConsumerToken const&) MOODYCAMEL_DELETE_FUNCTION; + +private: + template<typename T, typename Traits> friend class ConcurrentQueue; + +private: // but shared with ConcurrentQueue + std::uint32_t initialOffset; + std::uint32_t lastKnownGlobalOffset; + std::uint32_t itemsConsumedFromCurrent; + details::ConcurrentQueueProducerTypelessBase* currentProducer; + details::ConcurrentQueueProducerTypelessBase* desiredProducer; +}; + + +template<typename T, typename Traits = ConcurrentQueueDefaultTraits> +class ConcurrentQueue +{ +public: + struct ExplicitProducer; + + typedef moodycamel::ProducerToken producer_token_t; + typedef moodycamel::ConsumerToken consumer_token_t; + + typedef typename Traits::index_t index_t; + typedef typename Traits::size_t size_t; + + static const 
size_t BLOCK_SIZE = static_cast<size_t>(Traits::BLOCK_SIZE); + static const size_t EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD = static_cast<size_t>(Traits::EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD); + static const size_t EXPLICIT_INITIAL_INDEX_SIZE = static_cast<size_t>(Traits::EXPLICIT_INITIAL_INDEX_SIZE); + static const std::uint32_t EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE = static_cast<std::uint32_t>(Traits::EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE); +#ifdef _MSC_VER +#pragma warning(push) +#pragma warning(disable: 4307) // + integral constant overflow (that's what the ternary expression is for!) +#pragma warning(disable: 4309) // static_cast: Truncation of constant value +#endif + static const size_t MAX_SUBQUEUE_SIZE = (details::const_numeric_max<size_t>::value - static_cast<size_t>(Traits::MAX_SUBQUEUE_SIZE) < BLOCK_SIZE) ? details::const_numeric_max<size_t>::value : ((static_cast<size_t>(Traits::MAX_SUBQUEUE_SIZE) + (BLOCK_SIZE - 1)) / BLOCK_SIZE * BLOCK_SIZE); +#ifdef _MSC_VER +#pragma warning(pop) +#endif + + static_assert(!std::numeric_limits<size_t>::is_signed && std::is_integral<size_t>::value, "Traits::size_t must be an unsigned integral type"); + static_assert(!std::numeric_limits<index_t>::is_signed && std::is_integral<index_t>::value, "Traits::index_t must be an unsigned integral type"); + static_assert(sizeof(index_t) >= sizeof(size_t), "Traits::index_t must be at least as wide as Traits::size_t"); + static_assert((BLOCK_SIZE > 1) && !(BLOCK_SIZE & (BLOCK_SIZE - 1)), "Traits::BLOCK_SIZE must be a power of 2 (and at least 2)"); + static_assert((EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD > 1) && !(EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD & (EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD - 1)), "Traits::EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD must be a power of 2 (and greater than 1)"); + static_assert((EXPLICIT_INITIAL_INDEX_SIZE > 1) && !(EXPLICIT_INITIAL_INDEX_SIZE & (EXPLICIT_INITIAL_INDEX_SIZE - 1)), "Traits::EXPLICIT_INITIAL_INDEX_SIZE must be a power of 2 (and greater than 1)"); + +public: + // Creates a queue with at least `capacity` element slots; note that the + // actual number of elements that can be inserted without additional memory + // allocation depends on the number of producers and the block size (e.g. if + // the block size is equal to `capacity`, only a single block will be allocated + // up-front, which means only a single producer will be able to enqueue elements + // without an extra allocation -- blocks aren't shared between producers). + // This method is not thread safe -- it is up to the user to ensure that the + // queue is fully constructed before it starts being used by other threads (this + // includes making the memory effects of construction visible, possibly with a + // memory barrier). + explicit ConcurrentQueue(size_t capacity = 6 * BLOCK_SIZE) + : producerListTail(nullptr), + producerCount(0), + initialBlockPoolIndex(0), + nextExplicitConsumerId(0), + globalExplicitConsumerOffset(0) + { + populate_initial_block_list(capacity / BLOCK_SIZE + ((capacity & (BLOCK_SIZE - 1)) == 0 ? 0 : 1)); + } + + // Computes the correct amount of pre-allocated blocks for you based + // on the minimum number of elements you want available at any given + // time, and the maximum concurrent number of each type of producer. 
+ ConcurrentQueue(size_t minCapacity, size_t maxExplicitProducers) + : producerListTail(nullptr), + producerCount(0), + initialBlockPoolIndex(0), + nextExplicitConsumerId(0), + globalExplicitConsumerOffset(0) + { + size_t blocks = (((minCapacity + BLOCK_SIZE - 1) / BLOCK_SIZE) - 1) * (maxExplicitProducers + 1) + 2 * (maxExplicitProducers); + populate_initial_block_list(blocks); + } + + // Note: The queue should not be accessed concurrently while it's + // being deleted. It's up to the user to synchronize this. + // This method is not thread safe. + ~ConcurrentQueue() + { + // Destroy producers + auto ptr = producerListTail.load(std::memory_order_relaxed); + while (ptr != nullptr) { + auto next = ptr->next_prod(); + if (ptr->token != nullptr) { + ptr->token->producer = nullptr; + } + destroy(ptr); + ptr = next; + } + + // Destroy global free list + auto block = freeList.head_unsafe(); + while (block != nullptr) { + auto next = block->freeListNext.load(std::memory_order_relaxed); + if (block->dynamicallyAllocated) { + destroy(block); + } + block = next; + } + + // Destroy initial free list + destroy_array(initialBlockPool, initialBlockPoolSize); + } + + // Disable copying and copy assignment + ConcurrentQueue(ConcurrentQueue const&) MOODYCAMEL_DELETE_FUNCTION; + ConcurrentQueue(ConcurrentQueue&& other) MOODYCAMEL_DELETE_FUNCTION; + ConcurrentQueue& operator=(ConcurrentQueue const&) MOODYCAMEL_DELETE_FUNCTION; + ConcurrentQueue& operator=(ConcurrentQueue&& other) MOODYCAMEL_DELETE_FUNCTION; + +public: + tracy_force_inline T* enqueue_begin(producer_token_t const& token, index_t& currentTailIndex) + { + return static_cast<ExplicitProducer*>(token.producer)->ConcurrentQueue::ExplicitProducer::enqueue_begin(currentTailIndex); + } + + // Attempts to dequeue several elements from the queue using an explicit consumer token. + // Returns the number of items actually dequeued. + // Returns 0 if all producer streams appeared empty at the time they + // were checked (so, the queue is likely but not guaranteed to be empty). + // Never allocates. Thread-safe. 
+ template<typename It> + size_t try_dequeue_bulk(consumer_token_t& token, It itemFirst, size_t max) + { + if (token.desiredProducer == nullptr || token.lastKnownGlobalOffset != globalExplicitConsumerOffset.load(std::memory_order_relaxed)) { + if (!update_current_producer_after_rotation(token)) { + return 0; + } + } + + size_t count = static_cast<ProducerBase*>(token.currentProducer)->dequeue_bulk(itemFirst, max); + if (count == max) { + if ((token.itemsConsumedFromCurrent += static_cast<std::uint32_t>(max)) >= EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE) { + globalExplicitConsumerOffset.fetch_add(1, std::memory_order_relaxed); + } + return max; + } + token.itemsConsumedFromCurrent += static_cast<std::uint32_t>(count); + max -= count; + + auto tail = producerListTail.load(std::memory_order_acquire); + auto ptr = static_cast<ProducerBase*>(token.currentProducer)->next_prod(); + if (ptr == nullptr) { + ptr = tail; + } + while (ptr != static_cast<ProducerBase*>(token.currentProducer)) { + auto dequeued = ptr->dequeue_bulk(itemFirst, max); + count += dequeued; + if (dequeued != 0) { + token.currentProducer = ptr; + token.itemsConsumedFromCurrent = static_cast<std::uint32_t>(dequeued); + } + if (dequeued == max) { + break; + } + max -= dequeued; + ptr = ptr->next_prod(); + if (ptr == nullptr) { + ptr = tail; + } + } + return count; + } + + template<typename It> + size_t try_dequeue_bulk_single(consumer_token_t& token, It itemFirst, size_t max, uint64_t& threadId ) + { + if (token.desiredProducer == nullptr || token.lastKnownGlobalOffset != globalExplicitConsumerOffset.load(std::memory_order_relaxed)) { + if (!update_current_producer_after_rotation(token)) { + return 0; + } + } + + size_t count = static_cast<ProducerBase*>(token.currentProducer)->dequeue_bulk(itemFirst, max); + if (count == max) { + if ((token.itemsConsumedFromCurrent += static_cast<std::uint32_t>(max)) >= EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE) { + globalExplicitConsumerOffset.fetch_add(1, std::memory_order_relaxed); + } + threadId = token.currentProducer->threadId; + return max; + } + token.itemsConsumedFromCurrent += static_cast<std::uint32_t>(count); + + auto tail = producerListTail.load(std::memory_order_acquire); + auto ptr = static_cast<ProducerBase*>(token.currentProducer)->next_prod(); + if (ptr == nullptr) { + ptr = tail; + } + if( count == 0 ) + { + while (ptr != static_cast<ProducerBase*>(token.currentProducer)) { + auto dequeued = ptr->dequeue_bulk(itemFirst, max); + if (dequeued != 0) { + threadId = ptr->threadId; + token.currentProducer = ptr; + token.itemsConsumedFromCurrent = static_cast<std::uint32_t>(dequeued); + return dequeued; + } + ptr = ptr->next_prod(); + if (ptr == nullptr) { + ptr = tail; + } + } + return 0; + } + else + { + threadId = token.currentProducer->threadId; + token.currentProducer = ptr; + token.itemsConsumedFromCurrent = 0; + return count; + } + } + + + // Returns an estimate of the total number of elements currently in the queue. This + // estimate is only accurate if the queue has completely stabilized before it is called + // (i.e. all enqueue and dequeue operations have completed and their memory effects are + // visible on the calling thread, and no further operations start while this method is + // being called). + // Thread-safe. 
+ size_t size_approx() const + { + size_t size = 0; + for (auto ptr = producerListTail.load(std::memory_order_acquire); ptr != nullptr; ptr = ptr->next_prod()) { + size += ptr->size_approx(); + } + return size; + } + + + // Returns true if the underlying atomic variables used by + // the queue are lock-free (they should be on most platforms). + // Thread-safe. + static bool is_lock_free() + { + return + details::static_is_lock_free<bool>::value == 2 && + details::static_is_lock_free<size_t>::value == 2 && + details::static_is_lock_free<std::uint32_t>::value == 2 && + details::static_is_lock_free<index_t>::value == 2 && + details::static_is_lock_free<void*>::value == 2; + } + + +private: + friend struct ProducerToken; + friend struct ConsumerToken; + friend struct ExplicitProducer; + + + /////////////////////////////// + // Queue methods + /////////////////////////////// + + inline bool update_current_producer_after_rotation(consumer_token_t& token) + { + // Ah, there's been a rotation, figure out where we should be! + auto tail = producerListTail.load(std::memory_order_acquire); + if (token.desiredProducer == nullptr && tail == nullptr) { + return false; + } + auto prodCount = producerCount.load(std::memory_order_relaxed); + auto globalOffset = globalExplicitConsumerOffset.load(std::memory_order_relaxed); + if (details::cqUnlikely(token.desiredProducer == nullptr)) { + // Aha, first time we're dequeueing anything. + // Figure out our local position + // Note: offset is from start, not end, but we're traversing from end -- subtract from count first + std::uint32_t offset = prodCount - 1 - (token.initialOffset % prodCount); + token.desiredProducer = tail; + for (std::uint32_t i = 0; i != offset; ++i) { + token.desiredProducer = static_cast<ProducerBase*>(token.desiredProducer)->next_prod(); + if (token.desiredProducer == nullptr) { + token.desiredProducer = tail; + } + } + } + + std::uint32_t delta = globalOffset - token.lastKnownGlobalOffset; + if (delta >= prodCount) { + delta = delta % prodCount; + } + for (std::uint32_t i = 0; i != delta; ++i) { + token.desiredProducer = static_cast<ProducerBase*>(token.desiredProducer)->next_prod(); + if (token.desiredProducer == nullptr) { + token.desiredProducer = tail; + } + } + + token.lastKnownGlobalOffset = globalOffset; + token.currentProducer = token.desiredProducer; + token.itemsConsumedFromCurrent = 0; + return true; + } + + + /////////////////////////// + // Free list + /////////////////////////// + + template <typename N> + struct FreeListNode + { + FreeListNode() : freeListRefs(0), freeListNext(nullptr) { } + + std::atomic<std::uint32_t> freeListRefs; + std::atomic<N*> freeListNext; + }; + + // A simple CAS-based lock-free free list. Not the fastest thing in the world under heavy contention, but + // simple and correct (assuming nodes are never freed until after the free list is destroyed), and fairly + // speedy under low contention. 
+ template<typename N> // N must inherit FreeListNode or have the same fields (and initialization of them) + struct FreeList + { + FreeList() : freeListHead(nullptr) { } + FreeList(FreeList&& other) : freeListHead(other.freeListHead.load(std::memory_order_relaxed)) { other.freeListHead.store(nullptr, std::memory_order_relaxed); } + void swap(FreeList& other) { details::swap_relaxed(freeListHead, other.freeListHead); } + + FreeList(FreeList const&) MOODYCAMEL_DELETE_FUNCTION; + FreeList& operator=(FreeList const&) MOODYCAMEL_DELETE_FUNCTION; + + inline void add(N* node) + { + // We know that the should-be-on-freelist bit is 0 at this point, so it's safe to + // set it using a fetch_add + if (node->freeListRefs.fetch_add(SHOULD_BE_ON_FREELIST, std::memory_order_acq_rel) == 0) { + // Oh look! We were the last ones referencing this node, and we know + // we want to add it to the free list, so let's do it! + add_knowing_refcount_is_zero(node); + } + } + + inline N* try_get() + { + auto head = freeListHead.load(std::memory_order_acquire); + while (head != nullptr) { + auto prevHead = head; + auto refs = head->freeListRefs.load(std::memory_order_relaxed); + if ((refs & REFS_MASK) == 0 || !head->freeListRefs.compare_exchange_strong(refs, refs + 1, std::memory_order_acquire, std::memory_order_relaxed)) { + head = freeListHead.load(std::memory_order_acquire); + continue; + } + + // Good, reference count has been incremented (it wasn't at zero), which means we can read the + // next and not worry about it changing between now and the time we do the CAS + auto next = head->freeListNext.load(std::memory_order_relaxed); + if (freeListHead.compare_exchange_strong(head, next, std::memory_order_acquire, std::memory_order_relaxed)) { + // Yay, got the node. This means it was on the list, which means shouldBeOnFreeList must be false no + // matter the refcount (because nobody else knows it's been taken off yet, it can't have been put back on). + assert((head->freeListRefs.load(std::memory_order_relaxed) & SHOULD_BE_ON_FREELIST) == 0); + + // Decrease refcount twice, once for our ref, and once for the list's ref + head->freeListRefs.fetch_sub(2, std::memory_order_release); + return head; + } + + // OK, the head must have changed on us, but we still need to decrease the refcount we increased. + // Note that we don't need to release any memory effects, but we do need to ensure that the reference + // count decrement happens-after the CAS on the head. + refs = prevHead->freeListRefs.fetch_sub(1, std::memory_order_acq_rel); + if (refs == SHOULD_BE_ON_FREELIST + 1) { + add_knowing_refcount_is_zero(prevHead); + } + } + + return nullptr; + } + + // Useful for traversing the list when there's no contention (e.g. to destroy remaining nodes) + N* head_unsafe() const { return freeListHead.load(std::memory_order_relaxed); } + + private: + inline void add_knowing_refcount_is_zero(N* node) + { + // Since the refcount is zero, and nobody can increase it once it's zero (except us, and we run + // only one copy of this method per node at a time, i.e. 
the single thread case), then we know + // we can safely change the next pointer of the node; however, once the refcount is back above + // zero, then other threads could increase it (happens under heavy contention, when the refcount + // goes to zero in between a load and a refcount increment of a node in try_get, then back up to + // something non-zero, then the refcount increment is done by the other thread) -- so, if the CAS + // to add the node to the actual list fails, decrease the refcount and leave the add operation to + // the next thread who puts the refcount back at zero (which could be us, hence the loop). + auto head = freeListHead.load(std::memory_order_relaxed); + while (true) { + node->freeListNext.store(head, std::memory_order_relaxed); + node->freeListRefs.store(1, std::memory_order_release); + if (!freeListHead.compare_exchange_strong(head, node, std::memory_order_release, std::memory_order_relaxed)) { + // Hmm, the add failed, but we can only try again when the refcount goes back to zero + if (node->freeListRefs.fetch_add(SHOULD_BE_ON_FREELIST - 1, std::memory_order_release) == 1) { + continue; + } + } + return; + } + } + + private: + // Implemented like a stack, but where node order doesn't matter (nodes are inserted out of order under contention) + std::atomic<N*> freeListHead; + + static const std::uint32_t REFS_MASK = 0x7FFFFFFF; + static const std::uint32_t SHOULD_BE_ON_FREELIST = 0x80000000; + }; + + + /////////////////////////// + // Block + /////////////////////////// + + struct Block + { + Block() + : next(nullptr), elementsCompletelyDequeued(0), freeListRefs(0), freeListNext(nullptr), shouldBeOnFreeList(false), dynamicallyAllocated(true) + { + } + + inline bool is_empty() const + { + if (compile_time_condition<BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD>::value) { + // Check flags + for (size_t i = 0; i < BLOCK_SIZE; ++i) { + if (!emptyFlags[i].load(std::memory_order_relaxed)) { + return false; + } + } + + // Aha, empty; make sure we have all other memory effects that happened before the empty flags were set + std::atomic_thread_fence(std::memory_order_acquire); + return true; + } + else { + // Check counter + if (elementsCompletelyDequeued.load(std::memory_order_relaxed) == BLOCK_SIZE) { + std::atomic_thread_fence(std::memory_order_acquire); + return true; + } + assert(elementsCompletelyDequeued.load(std::memory_order_relaxed) <= BLOCK_SIZE); + return false; + } + } + + // Returns true if the block is now empty (does not apply in explicit context) + inline bool set_empty(index_t i) + { + if (BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) { + // Set flag + assert(!emptyFlags[BLOCK_SIZE - 1 - static_cast<size_t>(i & static_cast<index_t>(BLOCK_SIZE - 1))].load(std::memory_order_relaxed)); + emptyFlags[BLOCK_SIZE - 1 - static_cast<size_t>(i & static_cast<index_t>(BLOCK_SIZE - 1))].store(true, std::memory_order_release); + return false; + } + else { + // Increment counter + auto prevVal = elementsCompletelyDequeued.fetch_add(1, std::memory_order_release); + assert(prevVal < BLOCK_SIZE); + return prevVal == BLOCK_SIZE - 1; + } + } + + // Sets multiple contiguous item statuses to 'empty' (assumes no wrapping and count > 0). + // Returns true if the block is now empty (does not apply in explicit context). 
+ inline bool set_many_empty(index_t i, size_t count) + { + if (compile_time_condition<BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD>::value) { + // Set flags + std::atomic_thread_fence(std::memory_order_release); + i = BLOCK_SIZE - 1 - static_cast<size_t>(i & static_cast<index_t>(BLOCK_SIZE - 1)) - count + 1; + for (size_t j = 0; j != count; ++j) { + assert(!emptyFlags[i + j].load(std::memory_order_relaxed)); + emptyFlags[i + j].store(true, std::memory_order_relaxed); + } + return false; + } + else { + // Increment counter + auto prevVal = elementsCompletelyDequeued.fetch_add(count, std::memory_order_release); + assert(prevVal + count <= BLOCK_SIZE); + return prevVal + count == BLOCK_SIZE; + } + } + + inline void set_all_empty() + { + if (BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) { + // Set all flags + for (size_t i = 0; i != BLOCK_SIZE; ++i) { + emptyFlags[i].store(true, std::memory_order_relaxed); + } + } + else { + // Reset counter + elementsCompletelyDequeued.store(BLOCK_SIZE, std::memory_order_relaxed); + } + } + + inline void reset_empty() + { + if (compile_time_condition<BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD>::value) { + // Reset flags + for (size_t i = 0; i != BLOCK_SIZE; ++i) { + emptyFlags[i].store(false, std::memory_order_relaxed); + } + } + else { + // Reset counter + elementsCompletelyDequeued.store(0, std::memory_order_relaxed); + } + } + + inline T* operator[](index_t idx) MOODYCAMEL_NOEXCEPT { return static_cast<T*>(static_cast<void*>(elements)) + static_cast<size_t>(idx & static_cast<index_t>(BLOCK_SIZE - 1)); } + inline T const* operator[](index_t idx) const MOODYCAMEL_NOEXCEPT { return static_cast<T const*>(static_cast<void const*>(elements)) + static_cast<size_t>(idx & static_cast<index_t>(BLOCK_SIZE - 1)); } + + private: + // IMPORTANT: This must be the first member in Block, so that if T depends on the alignment of + // addresses returned by malloc, that alignment will be preserved. Apparently clang actually + // generates code that uses this assumption for AVX instructions in some cases. Ideally, we + // should also align Block to the alignment of T in case it's higher than malloc's 16-byte + // alignment, but this is hard to do in a cross-platform way. Assert for this case: + static_assert(std::alignment_of<T>::value <= std::alignment_of<details::max_align_t>::value, "The queue does not support super-aligned types at this time"); + // Additionally, we need the alignment of Block itself to be a multiple of max_align_t since + // otherwise the appropriate padding will not be added at the end of Block in order to make + // arrays of Blocks all be properly aligned (not just the first one). We use a union to force + // this. + union { + char elements[sizeof(T) * BLOCK_SIZE]; + details::max_align_t dummy; + }; + public: + Block* next; + std::atomic<size_t> elementsCompletelyDequeued; + std::atomic<bool> emptyFlags[BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD ? 
BLOCK_SIZE : 1]; + public: + std::atomic<std::uint32_t> freeListRefs; + std::atomic<Block*> freeListNext; + std::atomic<bool> shouldBeOnFreeList; + bool dynamicallyAllocated; // Perhaps a better name for this would be 'isNotPartOfInitialBlockPool' + }; + static_assert(std::alignment_of<Block>::value >= std::alignment_of<details::max_align_t>::value, "Internal error: Blocks must be at least as aligned as the type they are wrapping"); + + + /////////////////////////// + // Producer base + /////////////////////////// + + struct ProducerBase : public details::ConcurrentQueueProducerTypelessBase + { + ProducerBase(ConcurrentQueue* parent_) : + tailIndex(0), + headIndex(0), + dequeueOptimisticCount(0), + dequeueOvercommit(0), + tailBlock(nullptr), + parent(parent_) + { + } + + virtual ~ProducerBase() { }; + + template<typename It> + inline size_t dequeue_bulk(It& itemFirst, size_t max) + { + return static_cast<ExplicitProducer*>(this)->dequeue_bulk(itemFirst, max); + } + + inline ProducerBase* next_prod() const { return static_cast<ProducerBase*>(next); } + + inline size_t size_approx() const + { + auto tail = tailIndex.load(std::memory_order_relaxed); + auto head = headIndex.load(std::memory_order_relaxed); + return details::circular_less_than(head, tail) ? static_cast<size_t>(tail - head) : 0; + } + + inline index_t getTail() const { return tailIndex.load(std::memory_order_relaxed); } + protected: + std::atomic<index_t> tailIndex; // Where to enqueue to next + std::atomic<index_t> headIndex; // Where to dequeue from next + + std::atomic<index_t> dequeueOptimisticCount; + std::atomic<index_t> dequeueOvercommit; + + Block* tailBlock; + + public: + ConcurrentQueue* parent; + }; + + + public: + /////////////////////////// + // Explicit queue + /////////////////////////// + struct ExplicitProducer : public ProducerBase + { + explicit ExplicitProducer(ConcurrentQueue* _parent) : + ProducerBase(_parent), + blockIndex(nullptr), + pr_blockIndexSlotsUsed(0), + pr_blockIndexSize(EXPLICIT_INITIAL_INDEX_SIZE >> 1), + pr_blockIndexFront(0), + pr_blockIndexEntries(nullptr), + pr_blockIndexRaw(nullptr) + { + size_t poolBasedIndexSize = details::ceil_to_pow_2(_parent->initialBlockPoolSize) >> 1; + if (poolBasedIndexSize > pr_blockIndexSize) { + pr_blockIndexSize = poolBasedIndexSize; + } + + new_block_index(0); // This creates an index with double the number of current entries, i.e. EXPLICIT_INITIAL_INDEX_SIZE + } + + ~ExplicitProducer() + { + // Destruct any elements not yet dequeued. + // Since we're in the destructor, we can assume all elements + // are either completely dequeued or completely not (no halfways). 
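+ // (This relies on the usual precondition that nothing else is enqueueing or dequeueing
+ // while the queue is being destroyed.)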
+ if (this->tailBlock != nullptr) { // Note this means there must be a block index too + // First find the block that's partially dequeued, if any + Block* halfDequeuedBlock = nullptr; + if ((this->headIndex.load(std::memory_order_relaxed) & static_cast<index_t>(BLOCK_SIZE - 1)) != 0) { + // The head's not on a block boundary, meaning a block somewhere is partially dequeued + // (or the head block is the tail block and was fully dequeued, but the head/tail are still not on a boundary) + size_t i = (pr_blockIndexFront - pr_blockIndexSlotsUsed) & (pr_blockIndexSize - 1); + while (details::circular_less_than<index_t>(pr_blockIndexEntries[i].base + BLOCK_SIZE, this->headIndex.load(std::memory_order_relaxed))) { + i = (i + 1) & (pr_blockIndexSize - 1); + } + assert(details::circular_less_than<index_t>(pr_blockIndexEntries[i].base, this->headIndex.load(std::memory_order_relaxed))); + halfDequeuedBlock = pr_blockIndexEntries[i].block; + } + + // Start at the head block (note the first line in the loop gives us the head from the tail on the first iteration) + auto block = this->tailBlock; + do { + block = block->next; + if (block->ConcurrentQueue::Block::is_empty()) { + continue; + } + + size_t i = 0; // Offset into block + if (block == halfDequeuedBlock) { + i = static_cast<size_t>(this->headIndex.load(std::memory_order_relaxed) & static_cast<index_t>(BLOCK_SIZE - 1)); + } + + // Walk through all the items in the block; if this is the tail block, we need to stop when we reach the tail index + auto lastValidIndex = (this->tailIndex.load(std::memory_order_relaxed) & static_cast<index_t>(BLOCK_SIZE - 1)) == 0 ? BLOCK_SIZE : static_cast<size_t>(this->tailIndex.load(std::memory_order_relaxed) & static_cast<index_t>(BLOCK_SIZE - 1)); + while (i != BLOCK_SIZE && (block != this->tailBlock || i != lastValidIndex)) { + (*block)[i++]->~T(); + } + } while (block != this->tailBlock); + } + + // Destroy all blocks that we own + if (this->tailBlock != nullptr) { + auto block = this->tailBlock; + do { + auto nextBlock = block->next; + if (block->dynamicallyAllocated) { + destroy(block); + } + else { + this->parent->add_block_to_free_list(block); + } + block = nextBlock; + } while (block != this->tailBlock); + } + + // Destroy the block indices + auto header = static_cast<BlockIndexHeader*>(pr_blockIndexRaw); + while (header != nullptr) { + auto prev = static_cast<BlockIndexHeader*>(header->prev); + header->~BlockIndexHeader(); + (Traits::free)(header); + header = prev; + } + } + + inline void enqueue_begin_alloc(index_t currentTailIndex) + { + // We reached the end of a block, start a new one + if (this->tailBlock != nullptr && this->tailBlock->next->ConcurrentQueue::Block::is_empty()) { + // We can re-use the block ahead of us, it's empty! + this->tailBlock = this->tailBlock->next; + this->tailBlock->ConcurrentQueue::Block::reset_empty(); + + // We'll put the block on the block index (guaranteed to be room since we're conceptually removing the + // last block from it first -- except instead of removing then adding, we can just overwrite). + // Note that there must be a valid block index here, since even if allocation failed in the ctor, + // it would have been re-attempted when adding the first block to the queue; since there is such + // a block, a block index must have been successfully allocated. 
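+ // (Both branches fall through to the shared code below, which records the new tail block
+ // in the block index.)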
+ } + else { + // We're going to need a new block; check that the block index has room + if (pr_blockIndexRaw == nullptr || pr_blockIndexSlotsUsed == pr_blockIndexSize) { + // Hmm, the circular block index is already full -- we'll need + // to allocate a new index. Note pr_blockIndexRaw can only be nullptr if + // the initial allocation failed in the constructor. + new_block_index(pr_blockIndexSlotsUsed); + } + + // Insert a new block in the circular linked list + auto newBlock = this->parent->ConcurrentQueue::requisition_block(); + newBlock->ConcurrentQueue::Block::reset_empty(); + if (this->tailBlock == nullptr) { + newBlock->next = newBlock; + } + else { + newBlock->next = this->tailBlock->next; + this->tailBlock->next = newBlock; + } + this->tailBlock = newBlock; + ++pr_blockIndexSlotsUsed; + } + + // Add block to block index + auto& entry = blockIndex.load(std::memory_order_relaxed)->entries[pr_blockIndexFront]; + entry.base = currentTailIndex; + entry.block = this->tailBlock; + blockIndex.load(std::memory_order_relaxed)->front.store(pr_blockIndexFront, std::memory_order_release); + pr_blockIndexFront = (pr_blockIndexFront + 1) & (pr_blockIndexSize - 1); + } + + tracy_force_inline T* enqueue_begin(index_t& currentTailIndex) + { + currentTailIndex = this->tailIndex.load(std::memory_order_relaxed); + if (details::cqUnlikely((currentTailIndex & static_cast<index_t>(BLOCK_SIZE - 1)) == 0)) { + this->enqueue_begin_alloc(currentTailIndex); + } + return (*this->tailBlock)[currentTailIndex]; + } + + tracy_force_inline std::atomic<index_t>& get_tail_index() + { + return this->tailIndex; + } + + template<typename It> + size_t dequeue_bulk(It& itemFirst, size_t max) + { + auto tail = this->tailIndex.load(std::memory_order_relaxed); + auto overcommit = this->dequeueOvercommit.load(std::memory_order_relaxed); + auto desiredCount = static_cast<size_t>(tail - (this->dequeueOptimisticCount.load(std::memory_order_relaxed) - overcommit)); + if (details::circular_less_than<size_t>(0, desiredCount)) { + desiredCount = desiredCount < max ? desiredCount : max; + std::atomic_thread_fence(std::memory_order_acquire); + + auto myDequeueCount = this->dequeueOptimisticCount.fetch_add(desiredCount, std::memory_order_relaxed); + assert(overcommit <= myDequeueCount); + + tail = this->tailIndex.load(std::memory_order_acquire); + auto actualCount = static_cast<size_t>(tail - (myDequeueCount - overcommit)); + if (details::circular_less_than<size_t>(0, actualCount)) { + actualCount = desiredCount < actualCount ? desiredCount : actualCount; + if (actualCount < desiredCount) { + this->dequeueOvercommit.fetch_add(desiredCount - actualCount, std::memory_order_release); + } + + // Get the first index. Note that since there's guaranteed to be at least actualCount elements, this + // will never exceed tail. 
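+ // headIndex is only advanced once we know actualCount elements are really available, so it
+ // can never overtake tailIndex; any overshoot was already repaid via dequeueOvercommit above.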
+ auto firstIndex = this->headIndex.fetch_add(actualCount, std::memory_order_acq_rel); + + // Determine which block the first element is in + auto localBlockIndex = blockIndex.load(std::memory_order_acquire); + auto localBlockIndexHead = localBlockIndex->front.load(std::memory_order_acquire); + + auto headBase = localBlockIndex->entries[localBlockIndexHead].base; + auto firstBlockBaseIndex = firstIndex & ~static_cast<index_t>(BLOCK_SIZE - 1); + auto offset = static_cast<size_t>(static_cast<typename std::make_signed<index_t>::type>(firstBlockBaseIndex - headBase) / BLOCK_SIZE); + auto indexIndex = (localBlockIndexHead + offset) & (localBlockIndex->size - 1); + + // Iterate the blocks and dequeue + auto index = firstIndex; + do { + auto firstIndexInBlock = index; + auto endIndex = (index & ~static_cast<index_t>(BLOCK_SIZE - 1)) + static_cast<index_t>(BLOCK_SIZE); + endIndex = details::circular_less_than<index_t>(firstIndex + static_cast<index_t>(actualCount), endIndex) ? firstIndex + static_cast<index_t>(actualCount) : endIndex; + auto block = localBlockIndex->entries[indexIndex].block; + + const auto sz = endIndex - index; + memcpy( itemFirst, (*block)[index], sizeof( T ) * sz ); + index += sz; + itemFirst += sz; + + block->ConcurrentQueue::Block::set_many_empty(firstIndexInBlock, static_cast<size_t>(endIndex - firstIndexInBlock)); + indexIndex = (indexIndex + 1) & (localBlockIndex->size - 1); + } while (index != firstIndex + actualCount); + + return actualCount; + } + else { + // Wasn't anything to dequeue after all; make the effective dequeue count eventually consistent + this->dequeueOvercommit.fetch_add(desiredCount, std::memory_order_release); + } + } + + return 0; + } + + private: + struct BlockIndexEntry + { + index_t base; + Block* block; + }; + + struct BlockIndexHeader + { + size_t size; + std::atomic<size_t> front; // Current slot (not next, like pr_blockIndexFront) + BlockIndexEntry* entries; + void* prev; + }; + + + bool new_block_index(size_t numberOfFilledSlotsToExpose) + { + auto prevBlockSizeMask = pr_blockIndexSize - 1; + + // Create the new block + pr_blockIndexSize <<= 1; + auto newRawPtr = static_cast<char*>((Traits::malloc)(sizeof(BlockIndexHeader) + std::alignment_of<BlockIndexEntry>::value - 1 + sizeof(BlockIndexEntry) * pr_blockIndexSize)); + if (newRawPtr == nullptr) { + pr_blockIndexSize >>= 1; // Reset to allow graceful retry + return false; + } + + auto newBlockIndexEntries = reinterpret_cast<BlockIndexEntry*>(details::align_for<BlockIndexEntry>(newRawPtr + sizeof(BlockIndexHeader))); + + // Copy in all the old indices, if any + size_t j = 0; + if (pr_blockIndexSlotsUsed != 0) { + auto i = (pr_blockIndexFront - pr_blockIndexSlotsUsed) & prevBlockSizeMask; + do { + newBlockIndexEntries[j++] = pr_blockIndexEntries[i]; + i = (i + 1) & prevBlockSizeMask; + } while (i != pr_blockIndexFront); + } + + // Update everything + auto header = new (newRawPtr) BlockIndexHeader; + header->size = pr_blockIndexSize; + header->front.store(numberOfFilledSlotsToExpose - 1, std::memory_order_relaxed); + header->entries = newBlockIndexEntries; + header->prev = pr_blockIndexRaw; // we link the new block to the old one so we can free it later + + pr_blockIndexFront = j; + pr_blockIndexEntries = newBlockIndexEntries; + pr_blockIndexRaw = newRawPtr; + blockIndex.store(header, std::memory_order_release); + + return true; + } + + private: + std::atomic<BlockIndexHeader*> blockIndex; + + // To be used by producer only -- consumer must use the ones in referenced by blockIndex + size_t 
pr_blockIndexSlotsUsed; + size_t pr_blockIndexSize; + size_t pr_blockIndexFront; // Next slot (not current) + BlockIndexEntry* pr_blockIndexEntries; + void* pr_blockIndexRaw; + }; + + ExplicitProducer* get_explicit_producer(producer_token_t const& token) + { + return static_cast<ExplicitProducer*>(token.producer); + } + + private: + + ////////////////////////////////// + // Block pool manipulation + ////////////////////////////////// + + void populate_initial_block_list(size_t blockCount) + { + initialBlockPoolSize = blockCount; + if (initialBlockPoolSize == 0) { + initialBlockPool = nullptr; + return; + } + + initialBlockPool = create_array<Block>(blockCount); + if (initialBlockPool == nullptr) { + initialBlockPoolSize = 0; + } + for (size_t i = 0; i < initialBlockPoolSize; ++i) { + initialBlockPool[i].dynamicallyAllocated = false; + } + } + + inline Block* try_get_block_from_initial_pool() + { + if (initialBlockPoolIndex.load(std::memory_order_relaxed) >= initialBlockPoolSize) { + return nullptr; + } + + auto index = initialBlockPoolIndex.fetch_add(1, std::memory_order_relaxed); + + return index < initialBlockPoolSize ? (initialBlockPool + index) : nullptr; + } + + inline void add_block_to_free_list(Block* block) + { + freeList.add(block); + } + + inline void add_blocks_to_free_list(Block* block) + { + while (block != nullptr) { + auto next = block->next; + add_block_to_free_list(block); + block = next; + } + } + + inline Block* try_get_block_from_free_list() + { + return freeList.try_get(); + } + + // Gets a free block from one of the memory pools, or allocates a new one (if applicable) + Block* requisition_block() + { + auto block = try_get_block_from_initial_pool(); + if (block != nullptr) { + return block; + } + + block = try_get_block_from_free_list(); + if (block != nullptr) { + return block; + } + + return create<Block>(); + } + + + ////////////////////////////////// + // Producer list manipulation + ////////////////////////////////// + + ProducerBase* recycle_or_create_producer() + { + bool recycled; + return recycle_or_create_producer(recycled); + } + + ProducerBase* recycle_or_create_producer(bool& recycled) + { + // Try to re-use one first + for (auto ptr = producerListTail.load(std::memory_order_acquire); ptr != nullptr; ptr = ptr->next_prod()) { + if (ptr->inactive.load(std::memory_order_relaxed)) { + if( ptr->size_approx() == 0 ) + { + bool expected = true; + if (ptr->inactive.compare_exchange_strong(expected, /* desired */ false, std::memory_order_acquire, std::memory_order_relaxed)) { + // We caught one! It's been marked as activated, the caller can have it + recycled = true; + return ptr; + } + } + } + } + + recycled = false; + return add_producer(static_cast<ProducerBase*>(create<ExplicitProducer>(this))); + } + + ProducerBase* add_producer(ProducerBase* producer) + { + // Handle failed memory allocation + if (producer == nullptr) { + return nullptr; + } + + producerCount.fetch_add(1, std::memory_order_relaxed); + + // Add it to the lock-free list + auto prevTail = producerListTail.load(std::memory_order_relaxed); + do { + producer->next = prevTail; + } while (!producerListTail.compare_exchange_weak(prevTail, producer, std::memory_order_release, std::memory_order_relaxed)); + + return producer; + } + + void reown_producers() + { + // After another instance is moved-into/swapped-with this one, all the + // producers we stole still think their parents are the other queue. + // So fix them up! 
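+ // Only the parent pointers need patching; the producers themselves and their blocks stay put.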
+ for (auto ptr = producerListTail.load(std::memory_order_relaxed); ptr != nullptr; ptr = ptr->next_prod()) { + ptr->parent = this; + } + } + + ////////////////////////////////// + // Utility functions + ////////////////////////////////// + + template<typename U> + static inline U* create_array(size_t count) + { + assert(count > 0); + return static_cast<U*>((Traits::malloc)(sizeof(U) * count)); + } + + template<typename U> + static inline void destroy_array(U* p, size_t count) + { + ((void)count); + if (p != nullptr) { + assert(count > 0); + (Traits::free)(p); + } + } + + template<typename U> + static inline U* create() + { + auto p = (Traits::malloc)(sizeof(U)); + return p != nullptr ? new (p) U : nullptr; + } + + template<typename U, typename A1> + static inline U* create(A1&& a1) + { + auto p = (Traits::malloc)(sizeof(U)); + return p != nullptr ? new (p) U(std::forward<A1>(a1)) : nullptr; + } + + template<typename U> + static inline void destroy(U* p) + { + if (p != nullptr) { + p->~U(); + } + (Traits::free)(p); + } + +private: + std::atomic<ProducerBase*> producerListTail; + std::atomic<std::uint32_t> producerCount; + + std::atomic<size_t> initialBlockPoolIndex; + Block* initialBlockPool; + size_t initialBlockPoolSize; + + FreeList<Block> freeList; + + std::atomic<std::uint32_t> nextExplicitConsumerId; + std::atomic<std::uint32_t> globalExplicitConsumerOffset; +}; + + +template<typename T, typename Traits> +ProducerToken::ProducerToken(ConcurrentQueue<T, Traits>& queue) + : producer(queue.recycle_or_create_producer()) +{ + if (producer != nullptr) { + producer->token = this; + producer->threadId = detail::GetThreadHandleImpl(); + } +} + +template<typename T, typename Traits> +ConsumerToken::ConsumerToken(ConcurrentQueue<T, Traits>& queue) + : itemsConsumedFromCurrent(0), currentProducer(nullptr), desiredProducer(nullptr) +{ + initialOffset = queue.nextExplicitConsumerId.fetch_add(1, std::memory_order_release); + lastKnownGlobalOffset = static_cast<std::uint32_t>(-1); +} + +template<typename T, typename Traits> +inline void swap(ConcurrentQueue<T, Traits>& a, ConcurrentQueue<T, Traits>& b) MOODYCAMEL_NOEXCEPT +{ + a.swap(b); +} + +inline void swap(ProducerToken& a, ProducerToken& b) MOODYCAMEL_NOEXCEPT +{ + a.swap(b); +} + +inline void swap(ConsumerToken& a, ConsumerToken& b) MOODYCAMEL_NOEXCEPT +{ + a.swap(b); +} + +} + +} /* namespace tracy */ + +#if defined(__GNUC__) +#pragma GCC diagnostic pop +#endif diff --git a/libs/tracy/client/tracy_rpmalloc.cpp b/libs/tracy/client/tracy_rpmalloc.cpp @@ -0,0 +1,2099 @@ +#ifdef TRACY_ENABLE + +/* rpmalloc.c - Memory allocator - Public Domain - 2016 Mattias Jansson / Rampant Pixels + * + * This library provides a cross-platform lock free thread caching malloc implementation in C11. + * The latest source code is always available at + * + * https://github.com/rampantpixels/rpmalloc + * + * This library is put in the public domain; you can redistribute it and/or modify it without any restrictions. + * + */ + +#include "tracy_rpmalloc.hpp" + +/// Build time configurable limits +#ifndef HEAP_ARRAY_SIZE +//! Size of heap hashmap +#define HEAP_ARRAY_SIZE 79 +#endif +#ifndef ENABLE_THREAD_CACHE +//! Enable per-thread cache +#define ENABLE_THREAD_CACHE 1 +#endif +#ifndef ENABLE_GLOBAL_CACHE +//! Enable global cache shared between all threads, requires thread cache +#define ENABLE_GLOBAL_CACHE 1 +#endif +#ifndef ENABLE_VALIDATE_ARGS +//! Enable validation of args to public entry points +#define ENABLE_VALIDATE_ARGS 0 +#endif +#ifndef ENABLE_STATISTICS +//! 
Enable statistics collection +#define ENABLE_STATISTICS 0 +#endif +#ifndef ENABLE_ASSERTS +//! Enable asserts +#define ENABLE_ASSERTS 0 +#endif +#ifndef ENABLE_PRELOAD +//! Support preloading +#define ENABLE_PRELOAD 0 +#endif +#ifndef ENABLE_GUARDS +//! Enable overwrite/underwrite guards +#define ENABLE_GUARDS 0 +#endif +#ifndef ENABLE_UNLIMITED_CACHE +//! Unlimited cache disables any cache limitations +#define ENABLE_UNLIMITED_CACHE 0 +#endif +#ifndef DEFAULT_SPAN_MAP_COUNT +//! Default number of spans to map in call to map more virtual memory +#define DEFAULT_SPAN_MAP_COUNT 16 +#endif +//! Minimum cache size to remain after a release to global cache +#define MIN_SPAN_CACHE_SIZE 64 +//! Minimum number of spans to transfer between thread and global cache +#define MIN_SPAN_CACHE_RELEASE 16 +//! Maximum cache size divisor (max cache size will be max allocation count divided by this divisor) +#define MAX_SPAN_CACHE_DIVISOR 4 +//! Minimum cache size to remain after a release to global cache, large spans +#define MIN_LARGE_SPAN_CACHE_SIZE 8 +//! Minimum number of spans to transfer between thread and global cache, large spans +#define MIN_LARGE_SPAN_CACHE_RELEASE 4 +//! Maximum cache size divisor, large spans (max cache size will be max allocation count divided by this divisor) +#define MAX_LARGE_SPAN_CACHE_DIVISOR 16 +//! Multiplier for global span cache limit (max cache size will be calculated like thread cache and multiplied with this) +#define MAX_GLOBAL_CACHE_MULTIPLIER 8 + +#if !ENABLE_THREAD_CACHE +# undef ENABLE_GLOBAL_CACHE +# define ENABLE_GLOBAL_CACHE 0 +# undef MIN_SPAN_CACHE_SIZE +# undef MIN_SPAN_CACHE_RELEASE +# undef MAX_SPAN_CACHE_DIVISOR +# undef MIN_LARGE_SPAN_CACHE_SIZE +# undef MIN_LARGE_SPAN_CACHE_RELEASE +# undef MAX_LARGE_SPAN_CACHE_DIVISOR +#endif +#if !ENABLE_GLOBAL_CACHE +# undef MAX_GLOBAL_CACHE_MULTIPLIER +#endif + +/// Platform and arch specifics +#ifdef _MSC_VER +# pragma warning( push ) +# pragma warning( disable : 4324 ) +# define ALIGNED_STRUCT(name, alignment) __declspec(align(alignment)) struct name +# define FORCEINLINE __forceinline +# define atomic_thread_fence_acquire() //_ReadWriteBarrier() +# define atomic_thread_fence_release() //_ReadWriteBarrier() +# if ENABLE_VALIDATE_ARGS +# include <Intsafe.h> +# endif +# include <intrin.h> +#else +# include <unistd.h> +# if defined(__APPLE__) && ENABLE_PRELOAD +# include <pthread.h> +# endif +# define ALIGNED_STRUCT(name, alignment) struct __attribute__((__aligned__(alignment))) name +# ifdef FORCEINLINE +# undef FORCEINLINE +# endif +# define FORCEINLINE inline __attribute__((__always_inline__)) +# ifdef __arm__ +# define atomic_thread_fence_acquire() __asm volatile("dmb ish" ::: "memory") +# define atomic_thread_fence_release() __asm volatile("dmb ishst" ::: "memory") +# else +# define atomic_thread_fence_acquire() //__asm volatile("" ::: "memory") +# define atomic_thread_fence_release() //__asm volatile("" ::: "memory") +# endif +#endif + +#if defined( __x86_64__ ) || defined( _M_AMD64 ) || defined( _M_X64 ) || defined( _AMD64_ ) || defined( __arm64__ ) || defined( __aarch64__ ) +# define ARCH_64BIT 1 +#else +# define ARCH_64BIT 0 +#endif + +#if defined( _WIN32 ) || defined( __WIN32__ ) || defined( _WIN64 ) +# define PLATFORM_WINDOWS 1 +# define PLATFORM_POSIX 0 +#else +# define PLATFORM_WINDOWS 0 +# define PLATFORM_POSIX 1 +#endif + +#include <stdint.h> +#include <string.h> + +#include <assert.h> + +#if ENABLE_GUARDS +# define MAGIC_GUARD 0xDEADBAAD +#endif + +namespace tracy +{ + +/// Atomic access 
abstraction +ALIGNED_STRUCT(atomic32_t, 4) { + volatile int32_t nonatomic; +}; +typedef struct atomic32_t atomic32_t; + +ALIGNED_STRUCT(atomic64_t, 8) { + volatile int64_t nonatomic; +}; +typedef struct atomic64_t atomic64_t; + +ALIGNED_STRUCT(atomicptr_t, 8) { + volatile void* nonatomic; +}; +typedef struct atomicptr_t atomicptr_t; + +static FORCEINLINE int32_t +atomic_load32(atomic32_t* src) { + return src->nonatomic; +} + +static FORCEINLINE void +atomic_store32(atomic32_t* dst, int32_t val) { + dst->nonatomic = val; +} + +static FORCEINLINE int32_t +atomic_incr32(atomic32_t* val) { +#ifdef _MSC_VER + int32_t old = (int32_t)_InterlockedExchangeAdd((volatile long*)&val->nonatomic, 1); + return (old + 1); +#else + return __sync_add_and_fetch(&val->nonatomic, 1); +#endif +} + +static FORCEINLINE int32_t +atomic_add32(atomic32_t* val, int32_t add) { +#ifdef _MSC_VER + int32_t old = (int32_t)_InterlockedExchangeAdd((volatile long*)&val->nonatomic, add); + return (old + add); +#else + return __sync_add_and_fetch(&val->nonatomic, add); +#endif +} + +static FORCEINLINE void* +atomic_load_ptr(atomicptr_t* src) { + return (void*)((uintptr_t)src->nonatomic); +} + +static FORCEINLINE void +atomic_store_ptr(atomicptr_t* dst, void* val) { + dst->nonatomic = val; +} + +static FORCEINLINE int +atomic_cas_ptr(atomicptr_t* dst, void* val, void* ref) { +#ifdef _MSC_VER +# if ARCH_64BIT + return (_InterlockedCompareExchange64((volatile long long*)&dst->nonatomic, + (long long)val, (long long)ref) == (long long)ref) ? 1 : 0; +# else + return (_InterlockedCompareExchange((volatile long*)&dst->nonatomic, + (long)val, (long)ref) == (long)ref) ? 1 : 0; +# endif +#else + return __sync_bool_compare_and_swap(&dst->nonatomic, ref, val); +#endif +} + +/// Preconfigured limits and sizes +//! Granularity of a small allocation block +#define SMALL_GRANULARITY 32 +//! Small granularity shift count +#define SMALL_GRANULARITY_SHIFT 5 +//! Number of small block size classes +#define SMALL_CLASS_COUNT 63 +//! Maximum size of a small block +#define SMALL_SIZE_LIMIT 2016 +//! Granularity of a medium allocation block +#define MEDIUM_GRANULARITY 512 +//! Medium granularity shift count +#define MEDIUM_GRANULARITY_SHIFT 9 +//! Number of medium block size classes +#define MEDIUM_CLASS_COUNT 60 +//! Total number of small + medium size classes +#define SIZE_CLASS_COUNT (SMALL_CLASS_COUNT + MEDIUM_CLASS_COUNT) +//! Number of large block size classes +#define LARGE_CLASS_COUNT 32 +//! Maximum size of a medium block +#define MEDIUM_SIZE_LIMIT (SMALL_SIZE_LIMIT + (MEDIUM_GRANULARITY * MEDIUM_CLASS_COUNT) - SPAN_HEADER_SIZE) +//! Maximum size of a large block +#define LARGE_SIZE_LIMIT ((LARGE_CLASS_COUNT * _memory_span_size) - SPAN_HEADER_SIZE) +//! Size of a span header +#define SPAN_HEADER_SIZE 32 + +#define pointer_offset(ptr, ofs) (void*)((char*)(ptr) + (ptrdiff_t)(ofs)) +#define pointer_diff(first, second) (ptrdiff_t)((const char*)(first) - (const char*)(second)) + +#if ARCH_64BIT +typedef int64_t offset_t; +#else +typedef int32_t offset_t; +#endif +typedef uint32_t count_t; + +#if ENABLE_VALIDATE_ARGS +//! Maximum allocation size to avoid integer overflow +#undef MAX_ALLOC_SIZE +#define MAX_ALLOC_SIZE (((size_t)-1) - _memory_span_size) +#endif + +/// Data types +//! A memory heap, per thread +typedef struct heap_t heap_t; +//! Span of memory pages +typedef struct span_t span_t; +//! Size class definition +typedef struct size_class_t size_class_t; +//! Span block bookkeeping +typedef struct span_block_t span_block_t; +//! 
Span list bookkeeping +typedef struct span_list_t span_list_t; +//! Span data union, usage depending on span state +typedef union span_data_t span_data_t; +//! Cache data +typedef struct span_counter_t span_counter_t; +//! Global cache +typedef struct global_cache_t global_cache_t; + +//! Flag indicating span is the first (master) span of a split superspan +#define SPAN_FLAG_MASTER 1 +//! Flag indicating span is a secondary (sub) span of a split superspan +#define SPAN_FLAG_SUBSPAN 2 + +//Alignment offset must match in both structures to keep the data when +//transitioning between being used for blocks and being part of a list +struct span_block_t { + //! Free list + uint16_t free_list; + //! First autolinked block + uint16_t first_autolink; + //! Free count + uint16_t free_count; + //! Alignment offset + uint16_t align_offset; +}; + +struct span_list_t { + //! List size + uint32_t size; + //! Unused in lists + uint16_t unused; + //! Alignment offset + uint16_t align_offset; +}; + +union span_data_t { + //! Span data when used as blocks + span_block_t block; + //! Span data when used in lists + span_list_t list; +}; + +//A span can either represent a single span of memory pages with size declared by span_map_count configuration variable, +//or a set of spans in a continuous region, a super span. Any reference to the term "span" usually refers to both a single +//span or a super span. A super span can further be diviced into multiple spans (or this, super spans), where the first +//(super)span is the master and subsequent (super)spans are subspans. The master span keeps track of how many subspans +//that are still alive and mapped in virtual memory, and once all subspans and master have been unmapped the entire +//superspan region is released and unmapped (on Windows for example, the entire superspan range has to be released +//in the same call to release the virtual memory range, but individual subranges can be decommitted individually +//to reduce physical memory use). +struct span_t { + //! Heap ID + atomic32_t heap_id; + //! Size class + uint16_t size_class; + // TODO: If we could store remainder part of flags as an atomic counter, the entire check + // if master is owned by calling heap could be simplified to an atomic dec from any thread + // since remainder of a split super span only ever decreases, never increases + //! Flags and counters + uint16_t flags; + //! Span data + span_data_t data; + //! Next span + span_t* next_span; + //! Previous span + span_t* prev_span; +}; +static_assert(sizeof(span_t) <= SPAN_HEADER_SIZE, "span size mismatch"); + +//Adaptive cache counter of a single superspan span count +struct span_counter_t { + //! Allocation high water mark + uint32_t max_allocations; + //! Current number of allocations + uint32_t current_allocations; + //! Cache limit + uint32_t cache_limit; +}; + +struct heap_t { + //! Heap ID + int32_t id; + //! Free count for each size class active span + span_block_t active_block[SIZE_CLASS_COUNT]; + //! Active span for each size class + span_t* active_span[SIZE_CLASS_COUNT]; + //! List of semi-used spans with free blocks for each size class (double linked list) + span_t* size_cache[SIZE_CLASS_COUNT]; +#if ENABLE_THREAD_CACHE + //! List of free spans (single linked list) + span_t* span_cache[LARGE_CLASS_COUNT]; + //! Allocation counters + span_counter_t span_counter[LARGE_CLASS_COUNT]; +#endif + //! Mapped but unused spans + span_t* span_reserve; + //! Master span for mapped but unused spans + span_t* span_reserve_master; + //! 
Number of mapped but unused spans + size_t spans_reserved; + //! Deferred deallocation + atomicptr_t defer_deallocate; + //! Deferred unmaps + atomicptr_t defer_unmap; + //! Next heap in id list + heap_t* next_heap; + //! Next heap in orphan list + heap_t* next_orphan; + //! Memory pages alignment offset + size_t align_offset; +#if ENABLE_STATISTICS + //! Number of bytes transitioned thread -> global + size_t thread_to_global; + //! Number of bytes transitioned global -> thread + size_t global_to_thread; +#endif +}; + +struct size_class_t { + //! Size of blocks in this class + uint32_t size; + //! Number of blocks in each chunk + uint16_t block_count; + //! Class index this class is merged with + uint16_t class_idx; +}; +static_assert(sizeof(size_class_t) == 8, "Size class size mismatch"); + +struct global_cache_t { + //! Cache list pointer + atomicptr_t cache; + //! Cache size + atomic32_t size; + //! ABA counter + atomic32_t counter; +}; + +/// Global data +//! Configuration +static rpmalloc_config_t _memory_config; +//! Memory page size +static size_t _memory_page_size; +//! Shift to divide by page size +static size_t _memory_page_size_shift; +//! Mask to get to start of a memory page +static size_t _memory_page_mask; +//! Granularity at which memory pages are mapped by OS +static size_t _memory_map_granularity; +//! Size of a span of memory pages +static size_t _memory_span_size; +//! Shift to divide by span size +static size_t _memory_span_size_shift; +//! Mask to get to start of a memory span +static uintptr_t _memory_span_mask; +//! Global size classes +static size_class_t _memory_size_class[SIZE_CLASS_COUNT]; +//! Run-time size limit of medium blocks +static size_t _memory_medium_size_limit; +//! Heap ID counter +static atomic32_t _memory_heap_id; +#if ENABLE_THREAD_CACHE +//! Adaptive cache max allocation count +static uint32_t _memory_max_allocation[LARGE_CLASS_COUNT]; +#endif +#if ENABLE_GLOBAL_CACHE +//! Global span cache +static global_cache_t _memory_span_cache[LARGE_CLASS_COUNT]; +#endif +//! All heaps +static atomicptr_t _memory_heaps[HEAP_ARRAY_SIZE]; +//! Orphaned heaps +static atomicptr_t _memory_orphan_heaps; +//! Running orphan counter to avoid ABA issues in linked list +static atomic32_t _memory_orphan_counter; +//! Active heap count +static atomic32_t _memory_active_heaps; +#if ENABLE_STATISTICS +//! Total number of currently mapped memory pages +static atomic32_t _mapped_pages; +//! Total number of currently lost spans +static atomic32_t _reserved_spans; +//! Running counter of total number of mapped memory pages since start +static atomic32_t _mapped_total; +//! Running counter of total number of unmapped memory pages since start +static atomic32_t _unmapped_total; +#endif + +#define MEMORY_UNUSED(x) (void)sizeof((x)) + +//! Current thread heap +#if defined(__APPLE__) && ENABLE_PRELOAD +static pthread_key_t _memory_thread_heap; +#else +# ifdef _MSC_VER +# define _Thread_local __declspec(thread) +# define TLS_MODEL +# else +# define TLS_MODEL __attribute__((tls_model("initial-exec"))) +# if !defined(__clang__) && defined(__GNUC__) +# define _Thread_local __thread +# endif +# endif +static _Thread_local heap_t* _memory_thread_heap TLS_MODEL; +#endif + +//! Get the current thread heap +static FORCEINLINE heap_t* +get_thread_heap(void) { +#if defined(__APPLE__) && ENABLE_PRELOAD + return pthread_getspecific(_memory_thread_heap); +#else + return _memory_thread_heap; +#endif +} + +//! 
Set the current thread heap +static void +set_thread_heap(heap_t* heap) { +#if defined(__APPLE__) && ENABLE_PRELOAD + pthread_setspecific(_memory_thread_heap, heap); +#else + _memory_thread_heap = heap; +#endif +} + +//! Default implementation to map more virtual memory +static void* +_memory_map_os(size_t size, size_t* offset); + +//! Default implementation to unmap virtual memory +static void +_memory_unmap_os(void* address, size_t size, size_t offset, int release); + +//! Deallocate any deferred blocks and check for the given size class +static int +_memory_deallocate_deferred(heap_t* heap, size_t size_class); + +//! Lookup a memory heap from heap ID +static heap_t* +_memory_heap_lookup(int32_t id) { + uint32_t list_idx = id % HEAP_ARRAY_SIZE; + heap_t* heap = (heap_t*)atomic_load_ptr(&_memory_heaps[list_idx]); + while (heap && (heap->id != id)) + heap = heap->next_heap; + return heap; +} + +#if ENABLE_THREAD_CACHE + +//! Increase an allocation counter +static void +_memory_counter_increase(span_counter_t* counter, uint32_t* global_counter, size_t span_count) { + if (++counter->current_allocations > counter->max_allocations) { + counter->max_allocations = counter->current_allocations; + const uint32_t cache_limit_max = (uint32_t)_memory_span_size - 2; +#if !ENABLE_UNLIMITED_CACHE + counter->cache_limit = counter->max_allocations / ((span_count == 1) ? MAX_SPAN_CACHE_DIVISOR : MAX_LARGE_SPAN_CACHE_DIVISOR); + const uint32_t cache_limit_min = (span_count == 1) ? (MIN_SPAN_CACHE_RELEASE + MIN_SPAN_CACHE_SIZE) : (MIN_LARGE_SPAN_CACHE_RELEASE + MIN_LARGE_SPAN_CACHE_SIZE); + if (counter->cache_limit < cache_limit_min) + counter->cache_limit = cache_limit_min; + if (counter->cache_limit > cache_limit_max) + counter->cache_limit = cache_limit_max; +#else + counter->cache_limit = cache_limit_max; +#endif + if (counter->max_allocations > *global_counter) + *global_counter = counter->max_allocations; + } +} + +#else +# define _memory_counter_increase(counter, global_counter, span_count) do {} while (0) +#endif + +#if ENABLE_STATISTICS +# define _memory_statistics_add(atomic_counter, value) atomic_add32(atomic_counter, (int32_t)(value)) +# define _memory_statistics_sub(atomic_counter, value) atomic_add32(atomic_counter, -(int32_t)(value)) +#else +# define _memory_statistics_add(atomic_counter, value) do {} while(0) +# define _memory_statistics_sub(atomic_counter, value) do {} while(0) +#endif + +//! Map more virtual memory +static void* +_memory_map(size_t size, size_t* offset) { + assert(!(size % _memory_page_size)); + _memory_statistics_add(&_mapped_pages, (size >> _memory_page_size_shift)); + _memory_statistics_add(&_mapped_total, (size >> _memory_page_size_shift)); + return _memory_config.memory_map(size, offset); +} + +//! Unmap virtual memory +static void +_memory_unmap(void* address, size_t size, size_t offset, int release) { + assert((size < _memory_span_size) || !((uintptr_t)address & ~_memory_span_mask)); + assert(!(size % _memory_page_size)); + _memory_statistics_sub(&_mapped_pages, (size >> _memory_page_size_shift)); + _memory_statistics_add(&_unmapped_total, (size >> _memory_page_size_shift)); + _memory_config.memory_unmap(address, size, offset, release); +} + +//! Make flags field in a span from flags, remainder/distance and count +#define SPAN_MAKE_FLAGS(flags, remdist, count) ((uint16_t)((flags) | ((uint16_t)((remdist) - 1) << 2) | ((uint16_t)((count) - 1) << 9))); assert((flags) < 4); assert((remdist) && (remdist) < 128); assert((count) && (count) < 128) +//! 
Check if span has any of the given flags +#define SPAN_HAS_FLAG(flags, flag) ((flags) & (flag)) +//! Get the distance from flags field +#define SPAN_DISTANCE(flags) (1 + (((flags) >> 2) & 0x7f)) +//! Get the remainder from flags field +#define SPAN_REMAINS(flags) (1 + (((flags) >> 2) & 0x7f)) +//! Get the count from flags field +#define SPAN_COUNT(flags) (1 + (((flags) >> 9) & 0x7f)) +//! Set the remainder in the flags field (MUST be done from the owner heap thread) +#define SPAN_SET_REMAINS(flags, remains) flags = ((uint16_t)(((flags) & 0xfe03) | ((uint16_t)((remains) - 1) << 2))); assert((remains) < 128) + +//! Resize the given super span to the given count of spans, store the remainder in the heap reserved spans fields +static void +_memory_set_span_remainder_as_reserved(heap_t* heap, span_t* span, size_t use_count) { + size_t current_count = SPAN_COUNT(span->flags); + + assert(!SPAN_HAS_FLAG(span->flags, SPAN_FLAG_MASTER) || !SPAN_HAS_FLAG(span->flags, SPAN_FLAG_SUBSPAN)); + assert((current_count > 1) && (current_count < 127)); + assert(!heap->spans_reserved); + assert((size_t)SPAN_COUNT(span->flags) == current_count); + assert(current_count > use_count); + + heap->span_reserve = (span_t*)pointer_offset(span, use_count * _memory_span_size); + heap->spans_reserved = current_count - use_count; + if (!SPAN_HAS_FLAG(span->flags, SPAN_FLAG_MASTER | SPAN_FLAG_SUBSPAN)) { + //We must store the heap id before setting as master, to force unmaps to defer to this heap thread + atomic_store32(&span->heap_id, heap->id); + atomic_thread_fence_release(); + heap->span_reserve_master = span; + span->flags = SPAN_MAKE_FLAGS(SPAN_FLAG_MASTER, current_count, use_count); + _memory_statistics_add(&_reserved_spans, current_count); + } + else if (SPAN_HAS_FLAG(span->flags, SPAN_FLAG_MASTER)) { + //Only owner heap thread can modify a master span + assert(atomic_load32(&span->heap_id) == heap->id); + uint16_t remains = SPAN_REMAINS(span->flags); + assert(remains >= current_count); + heap->span_reserve_master = span; + span->flags = SPAN_MAKE_FLAGS(SPAN_FLAG_MASTER, remains, use_count); + } + else { //SPAN_FLAG_SUBSPAN + //Resizing a subspan is a safe operation in any thread + uint16_t distance = SPAN_DISTANCE(span->flags); + span_t* master = (span_t*)pointer_offset(span, -(int)distance * (int)_memory_span_size); + heap->span_reserve_master = master; + assert(SPAN_HAS_FLAG(master->flags, SPAN_FLAG_MASTER)); + assert((size_t)SPAN_REMAINS(master->flags) >= current_count); + span->flags = SPAN_MAKE_FLAGS(SPAN_FLAG_SUBSPAN, distance, use_count); + } + assert((SPAN_COUNT(span->flags) + heap->spans_reserved) == current_count); +} + +//! Map in memory pages for the given number of spans (or use previously reserved pages) +static span_t* +_memory_map_spans(heap_t* heap, size_t span_count) { + if (span_count <= heap->spans_reserved) { + span_t* span = heap->span_reserve; + heap->span_reserve = (span_t*)pointer_offset(span, span_count * _memory_span_size); + heap->spans_reserved -= span_count; + //Declare the span to be a subspan with given distance from master span + uint16_t distance = (uint16_t)((uintptr_t)pointer_diff(span, heap->span_reserve_master) >> _memory_span_size_shift); + span->flags = SPAN_MAKE_FLAGS(SPAN_FLAG_SUBSPAN, distance, span_count); + span->data.block.align_offset = 0; + return span; + } + + //We cannot request extra spans if we already have some (but not enough) pending reserved spans + size_t request_spans = (heap->spans_reserved || (span_count > _memory_config.span_map_count)) ? 
span_count : _memory_config.span_map_count; + size_t align_offset = 0; + span_t* span = (span_t*)_memory_map(request_spans * _memory_span_size, &align_offset); + span->flags = SPAN_MAKE_FLAGS(0, request_spans, request_spans); + span->data.block.align_offset = (uint16_t)align_offset; + if (request_spans > span_count) { + //We have extra spans, store them as reserved spans in heap + _memory_set_span_remainder_as_reserved(heap, span, span_count); + } + return span; +} + +//! Defer unmapping of the given span to the owner heap +static int +_memory_unmap_defer(int32_t heap_id, span_t* span) { + //Get the heap and link in pointer in list of deferred operations + heap_t* heap = _memory_heap_lookup(heap_id); + if (!heap) + return 0; + atomic_store32(&span->heap_id, heap_id); + void* last_ptr; + do { + last_ptr = atomic_load_ptr(&heap->defer_unmap); + span->next_span = (span_t*)last_ptr; + } while (!atomic_cas_ptr(&heap->defer_unmap, span, last_ptr)); + return 1; +} + +//! Unmap memory pages for the given number of spans (or mark as unused if no partial unmappings) +static void +_memory_unmap_span(heap_t* heap, span_t* span) { + size_t span_count = SPAN_COUNT(span->flags); + assert(!SPAN_HAS_FLAG(span->flags, SPAN_FLAG_MASTER) || !SPAN_HAS_FLAG(span->flags, SPAN_FLAG_SUBSPAN)); + //A plain run of spans can be unmapped directly + if (!SPAN_HAS_FLAG(span->flags, SPAN_FLAG_MASTER | SPAN_FLAG_SUBSPAN)) { + _memory_unmap(span, span_count * _memory_span_size, span->data.list.align_offset, 1); + return; + } + + uint32_t is_master = SPAN_HAS_FLAG(span->flags, SPAN_FLAG_MASTER); + span_t* master = is_master ? span : (span_t*)(pointer_offset(span, -(int)SPAN_DISTANCE(span->flags) * (int)_memory_span_size)); + + assert(is_master || SPAN_HAS_FLAG(span->flags, SPAN_FLAG_SUBSPAN)); + assert(SPAN_HAS_FLAG(master->flags, SPAN_FLAG_MASTER)); + + //Check if we own the master span, otherwise defer (only owner of master span can modify remainder field) + int32_t master_heap_id = atomic_load32(&master->heap_id); + if (heap && (master_heap_id != heap->id)) { + if (_memory_unmap_defer(master_heap_id, span)) + return; + } + if (!is_master) { + //Directly unmap subspans + assert(span->data.list.align_offset == 0); + _memory_unmap(span, span_count * _memory_span_size, 0, 0); + _memory_statistics_sub(&_reserved_spans, span_count); + } + else { + //Special double flag to denote an unmapped master + //It must be kept in memory since span header must be used + span->flags |= SPAN_FLAG_MASTER | SPAN_FLAG_SUBSPAN; + } + //We are in owner thread of the master span + uint32_t remains = SPAN_REMAINS(master->flags); + assert(remains >= span_count); + remains = ((uint32_t)span_count >= remains) ? 0 : (remains - (uint32_t)span_count); + if (!remains) { + //Everything unmapped, unmap the master span with release flag to unmap the entire range of the super span + assert(SPAN_HAS_FLAG(master->flags, SPAN_FLAG_MASTER) && SPAN_HAS_FLAG(master->flags, SPAN_FLAG_SUBSPAN)); + span_count = SPAN_COUNT(master->flags); + _memory_unmap(master, span_count * _memory_span_size, master->data.list.align_offset, 1); + _memory_statistics_sub(&_reserved_spans, span_count); + } + else { + //Set remaining spans + SPAN_SET_REMAINS(master->flags, remains); + } +} + +//! 
Process pending deferred cross-thread unmaps +static span_t* +_memory_unmap_deferred(heap_t* heap, size_t wanted_count) { + //Grab the current list of deferred unmaps + atomic_thread_fence_acquire(); + span_t* span = (span_t*)atomic_load_ptr(&heap->defer_unmap); + if (!span || !atomic_cas_ptr(&heap->defer_unmap, 0, span)) + return 0; + span_t* found_span = 0; + do { + //Verify that we own the master span, otherwise re-defer to owner + void* next = span->next_span; + size_t span_count = SPAN_COUNT(span->flags); + if (!found_span && span_count == wanted_count) { + assert(!SPAN_HAS_FLAG(span->flags, SPAN_FLAG_MASTER) || !SPAN_HAS_FLAG(span->flags, SPAN_FLAG_SUBSPAN)); + found_span = span; + } + else { + uint32_t is_master = SPAN_HAS_FLAG(span->flags, SPAN_FLAG_MASTER); + span_t* master = is_master ? span : (span_t*)(pointer_offset(span, -(int)SPAN_DISTANCE(span->flags) * (int)_memory_span_size)); + int32_t master_heap_id = atomic_load32(&master->heap_id); + if ((atomic_load32(&span->heap_id) == master_heap_id) || + !_memory_unmap_defer(master_heap_id, span)) { + //We own the master span (or heap merged and abandoned) + _memory_unmap_span(heap, span); + } + } + span = (span_t*)next; + } while (span); + return found_span; +} + +//! Unmap a single linked list of spans +static void +_memory_unmap_span_list(heap_t* heap, span_t* span) { + size_t list_size = span->data.list.size; + for (size_t ispan = 0; ispan < list_size; ++ispan) { + span_t* next_span = span->next_span; + _memory_unmap_span(heap, span); + span = next_span; + } + assert(!span); +} + +#if ENABLE_THREAD_CACHE + +//! Split a super span in two +static span_t* +_memory_span_split(heap_t* heap, span_t* span, size_t use_count) { + uint16_t distance = 0; + size_t current_count = SPAN_COUNT(span->flags); + assert(current_count > use_count); + assert(!SPAN_HAS_FLAG(span->flags, SPAN_FLAG_MASTER) || !SPAN_HAS_FLAG(span->flags, SPAN_FLAG_SUBSPAN)); + if (!SPAN_HAS_FLAG(span->flags, SPAN_FLAG_MASTER | SPAN_FLAG_SUBSPAN)) { + //Must store heap in master span before use, to avoid issues when unmapping subspans + atomic_store32(&span->heap_id, heap->id); + atomic_thread_fence_release(); + span->flags = SPAN_MAKE_FLAGS(SPAN_FLAG_MASTER, current_count, use_count); + _memory_statistics_add(&_reserved_spans, current_count); + } + else if (SPAN_HAS_FLAG(span->flags, SPAN_FLAG_MASTER)) { + //Only valid to call on master span if we own it + assert(atomic_load32(&span->heap_id) == heap->id); + uint16_t remains = SPAN_REMAINS(span->flags); + assert(remains >= current_count); + span->flags = SPAN_MAKE_FLAGS(SPAN_FLAG_MASTER, remains, use_count); + } + else { //SPAN_FLAG_SUBSPAN + distance = SPAN_DISTANCE(span->flags); + span->flags = SPAN_MAKE_FLAGS(SPAN_FLAG_SUBSPAN, distance, use_count); + } + //Setup remainder as a subspan + span_t* subspan = (span_t*)pointer_offset(span, use_count * _memory_span_size); + subspan->flags = SPAN_MAKE_FLAGS(SPAN_FLAG_SUBSPAN, distance + use_count, current_count - use_count); + subspan->data.list.align_offset = 0; + return subspan; +} + +//! Add span to head of single linked span list +static size_t +_memory_span_list_push(span_t** head, span_t* span) { + span->next_span = *head; + if (*head) + span->data.list.size = (*head)->data.list.size + 1; + else + span->data.list.size = 1; + *head = span; + return span->data.list.size; +} + +//! 
Remove span from head of single linked span list, returns the new list head +static span_t* +_memory_span_list_pop(span_t** head) { + span_t* span = *head; + span_t* next_span = 0; + if (span->data.list.size > 1) { + next_span = span->next_span; + assert(next_span); + next_span->data.list.size = span->data.list.size - 1; + } + *head = next_span; + return span; +} + +//! Split a single linked span list +static span_t* +_memory_span_list_split(span_t* span, size_t limit) { + span_t* next = 0; + if (limit < 2) + limit = 2; + if (span->data.list.size > limit) { + count_t list_size = 1; + span_t* last = span; + next = span->next_span; + while (list_size < limit) { + last = next; + next = next->next_span; + ++list_size; + } + last->next_span = 0; + assert(next); + next->data.list.size = span->data.list.size - list_size; + span->data.list.size = list_size; + span->prev_span = 0; + } + return next; +} + +#endif + +//! Add a span to a double linked list +static void +_memory_span_list_doublelink_add(span_t** head, span_t* span) { + if (*head) { + (*head)->prev_span = span; + span->next_span = *head; + } + else { + span->next_span = 0; + } + *head = span; +} + +//! Remove a span from a double linked list +static void +_memory_span_list_doublelink_remove(span_t** head, span_t* span) { + if (*head == span) { + *head = span->next_span; + } + else { + span_t* next_span = span->next_span; + span_t* prev_span = span->prev_span; + if (next_span) + next_span->prev_span = prev_span; + prev_span->next_span = next_span; + } +} + +#if ENABLE_GLOBAL_CACHE + +//! Insert the given list of memory page spans in the global cache +static void +_memory_cache_insert(heap_t* heap, global_cache_t* cache, span_t* span, size_t cache_limit) { + assert((span->data.list.size == 1) || (span->next_span != 0)); + int32_t list_size = (int32_t)span->data.list.size; + //Unmap if cache has reached the limit + if (atomic_add32(&cache->size, list_size) > (int32_t)cache_limit) { + _memory_unmap_span_list(heap, span); + atomic_add32(&cache->size, -list_size); + return; + } + void* current_cache, *new_cache; + do { + current_cache = atomic_load_ptr(&cache->cache); + span->prev_span = (span_t*)(void*)((uintptr_t)current_cache & _memory_span_mask); + new_cache = (void*)((uintptr_t)span | ((uintptr_t)atomic_incr32(&cache->counter) & ~_memory_span_mask)); + } while (!atomic_cas_ptr(&cache->cache, new_cache, current_cache)); +} + +//! Extract a number of memory page spans from the global cache +static span_t* +_memory_cache_extract(global_cache_t* cache) { + uintptr_t span_ptr; + do { + void* global_span = atomic_load_ptr(&cache->cache); + span_ptr = (uintptr_t)global_span & _memory_span_mask; + if (span_ptr) { + span_t* span = (span_t*)(void*)span_ptr; + //By accessing the span ptr before it is swapped out of list we assume that a contending thread + //does not manage to traverse the span to being unmapped before we access it + void* new_cache = (void*)((uintptr_t)span->prev_span | ((uintptr_t)atomic_incr32(&cache->counter) & ~_memory_span_mask)); + if (atomic_cas_ptr(&cache->cache, new_cache, global_span)) { + atomic_add32(&cache->size, -(int32_t)span->data.list.size); + return span; + } + } + } while (span_ptr); + return 0; +} + +//! 
Finalize a global cache, only valid from allocator finalization (not thread safe) +static void +_memory_cache_finalize(global_cache_t* cache) { + void* current_cache = atomic_load_ptr(&cache->cache); + span_t* span = (span_t*)(void*)((uintptr_t)current_cache & _memory_span_mask); + while (span) { + span_t* skip_span = (span_t*)(void*)((uintptr_t)span->prev_span & _memory_span_mask); + atomic_add32(&cache->size, -(int32_t)span->data.list.size); + _memory_unmap_span_list(0, span); + span = skip_span; + } + assert(!atomic_load32(&cache->size)); + atomic_store_ptr(&cache->cache, 0); + atomic_store32(&cache->size, 0); +} + +//! Insert the given list of memory page spans in the global cache +static void +_memory_global_cache_insert(heap_t* heap, span_t* span) { + //Calculate adaptive limits + size_t span_count = SPAN_COUNT(span->flags); + const size_t cache_divisor = (span_count == 1) ? MAX_SPAN_CACHE_DIVISOR : (MAX_LARGE_SPAN_CACHE_DIVISOR * span_count * 2); + const size_t cache_limit = (MAX_GLOBAL_CACHE_MULTIPLIER * _memory_max_allocation[span_count - 1]) / cache_divisor; + const size_t cache_limit_min = MAX_GLOBAL_CACHE_MULTIPLIER * (span_count == 1 ? MIN_SPAN_CACHE_SIZE : MIN_LARGE_SPAN_CACHE_SIZE); + _memory_cache_insert(heap, &_memory_span_cache[span_count - 1], span, cache_limit > cache_limit_min ? cache_limit : cache_limit_min); +} + +//! Extract a number of memory page spans from the global cache for large blocks +static span_t* +_memory_global_cache_extract(size_t span_count) { + span_t* span = _memory_cache_extract(&_memory_span_cache[span_count - 1]); + assert(!span || ((size_t)SPAN_COUNT(span->flags) == span_count)); + return span; +} + +#endif + +//! Insert a single span into thread heap cache, releasing to global cache if overflow +static void +_memory_heap_cache_insert(heap_t* heap, span_t* span) { +#if ENABLE_THREAD_CACHE + size_t span_count = SPAN_COUNT(span->flags); + size_t idx = span_count - 1; + if (_memory_span_list_push(&heap->span_cache[idx], span) <= heap->span_counter[idx].cache_limit) + return; + heap->span_cache[idx] = _memory_span_list_split(span, heap->span_counter[idx].cache_limit); + assert(span->data.list.size == heap->span_counter[idx].cache_limit); +#if ENABLE_STATISTICS + heap->thread_to_global += (size_t)span->data.list.size * span_count * _memory_span_size; +#endif +#if ENABLE_GLOBAL_CACHE + _memory_global_cache_insert(heap, span); +#else + _memory_unmap_span_list(heap, span); +#endif +#else + _memory_unmap_span(heap, span); +#endif +} + +//! 
Extract the given number of spans from the different cache levels +static span_t* +_memory_heap_cache_extract(heap_t* heap, size_t span_count) { +#if ENABLE_THREAD_CACHE + size_t idx = span_count - 1; + //Step 1: check thread cache + if (heap->span_cache[idx]) + return _memory_span_list_pop(&heap->span_cache[idx]); +#endif + //Step 2: Check reserved spans + if (heap->spans_reserved >= span_count) + return _memory_map_spans(heap, span_count); + //Step 3: Try processing deferred unmappings + span_t* span = _memory_unmap_deferred(heap, span_count); + if (span) + return span; +#if ENABLE_THREAD_CACHE + //Step 4: Check larger super spans and split if we find one + for (++idx; idx < LARGE_CLASS_COUNT; ++idx) { + if (heap->span_cache[idx]) { + span = _memory_span_list_pop(&heap->span_cache[idx]); + break; + } + } + if (span) { + //Mark the span as owned by this heap before splitting + size_t got_count = SPAN_COUNT(span->flags); + assert(got_count > span_count); + atomic_store32(&span->heap_id, heap->id); + atomic_thread_fence_release(); + + //Split the span and store as reserved if no previously reserved spans, or in thread cache otherwise + span_t* subspan = _memory_span_split(heap, span, span_count); + assert((size_t)(SPAN_COUNT(span->flags) + SPAN_COUNT(subspan->flags)) == got_count); + assert((size_t)SPAN_COUNT(span->flags) == span_count); + if (!heap->spans_reserved) { + heap->spans_reserved = got_count - span_count; + heap->span_reserve = subspan; + heap->span_reserve_master = (span_t*)pointer_offset(subspan, -(int32_t)SPAN_DISTANCE(subspan->flags) * (int32_t)_memory_span_size); + } + else { + _memory_heap_cache_insert(heap, subspan); + } + return span; + } +#if ENABLE_GLOBAL_CACHE + //Step 5: Extract from global cache + idx = span_count - 1; + heap->span_cache[idx] = _memory_global_cache_extract(span_count); + if (heap->span_cache[idx]) { +#if ENABLE_STATISTICS + heap->global_to_thread += (size_t)heap->span_cache[idx]->data.list.size * span_count * _memory_span_size; +#endif + return _memory_span_list_pop(&heap->span_cache[idx]); + } +#endif +#endif + return 0; +} + +//! Allocate a small/medium sized memory block from the given heap +static void* +_memory_allocate_from_heap(heap_t* heap, size_t size) { + //Calculate the size class index and do a dependent lookup of the final class index (in case of merged classes) + const size_t base_idx = (size <= SMALL_SIZE_LIMIT) ? + ((size + (SMALL_GRANULARITY - 1)) >> SMALL_GRANULARITY_SHIFT) : + SMALL_CLASS_COUNT + ((size - SMALL_SIZE_LIMIT + (MEDIUM_GRANULARITY - 1)) >> MEDIUM_GRANULARITY_SHIFT); + assert(!base_idx || ((base_idx - 1) < SIZE_CLASS_COUNT)); + const size_t class_idx = _memory_size_class[base_idx ? (base_idx - 1) : 0].class_idx; + + span_block_t* active_block = heap->active_block + class_idx; + size_class_t* size_class = _memory_size_class + class_idx; + const count_t class_size = size_class->size; + + //Step 1: Try to get a block from the currently active span. 
The span block bookkeeping + // data for the active span is stored in the heap for faster access +use_active: + if (active_block->free_count) { + //Happy path, we have a span with at least one free block + span_t* span = heap->active_span[class_idx]; + count_t offset = class_size * active_block->free_list; + uint32_t* block = (uint32_t*)pointer_offset(span, SPAN_HEADER_SIZE + offset); + assert(span); + + --active_block->free_count; + if (!active_block->free_count) { + //Span is now completely allocated, set the bookkeeping data in the + //span itself and reset the active span pointer in the heap + span->data.block.free_count = 0; + span->data.block.first_autolink = (uint16_t)size_class->block_count; + heap->active_span[class_idx] = 0; + } + else { + //Get the next free block, either from linked list or from auto link + if (active_block->free_list < active_block->first_autolink) { + active_block->free_list = (uint16_t)(*block); + } + else { + ++active_block->free_list; + ++active_block->first_autolink; + } + assert(active_block->free_list < size_class->block_count); + } + + return block; + } + + //Step 2: No active span, try executing deferred deallocations and try again if there + // was at least one of the requested size class + if (_memory_deallocate_deferred(heap, class_idx)) { + if (active_block->free_count) + goto use_active; + } + + //Step 3: Check if there is a semi-used span of the requested size class available + if (heap->size_cache[class_idx]) { + //Promote a pending semi-used span to be active, storing bookkeeping data in + //the heap structure for faster access + span_t* span = heap->size_cache[class_idx]; + *active_block = span->data.block; + assert(active_block->free_count > 0); + heap->size_cache[class_idx] = span->next_span; + heap->active_span[class_idx] = span; + + //Mark span as owned by this heap + atomic_store32(&span->heap_id, heap->id); + atomic_thread_fence_release(); + + goto use_active; + } + + //Step 4: Find a span in one of the cache levels + span_t* span = _memory_heap_cache_extract(heap, 1); + if (!span) { + //Step 5: Map in more virtual memory + span = _memory_map_spans(heap, 1); + } + + //Mark span as owned by this heap and set base data + assert(SPAN_COUNT(span->flags) == 1); + span->size_class = (uint16_t)class_idx; + atomic_store32(&span->heap_id, heap->id); + atomic_thread_fence_release(); + + //If we only have one block we will grab it, otherwise + //set span as new span to use for next allocation + if (size_class->block_count > 1) { + //Reset block order to sequential auto linked order + active_block->free_count = (uint16_t)(size_class->block_count - 1); + active_block->free_list = 1; + active_block->first_autolink = 1; + heap->active_span[class_idx] = span; + } + else { + span->data.block.free_count = 0; + span->data.block.first_autolink = (uint16_t)size_class->block_count; + } + + //Track counters + _memory_counter_increase(&heap->span_counter[0], &_memory_max_allocation[0], 1); + + //Return first block if memory page span + return pointer_offset(span, SPAN_HEADER_SIZE); +} + +//! 
Allocate a large sized memory block from the given heap +static void* +_memory_allocate_large_from_heap(heap_t* heap, size_t size) { + //Calculate number of needed max sized spans (including header) + //Since this function is never called if size > LARGE_SIZE_LIMIT + //the span_count is guaranteed to be <= LARGE_CLASS_COUNT + size += SPAN_HEADER_SIZE; + size_t span_count = size >> _memory_span_size_shift; + if (size & (_memory_span_size - 1)) + ++span_count; + size_t idx = span_count - 1; + +#if ENABLE_THREAD_CACHE + if (!heap->span_cache[idx]) + _memory_deallocate_deferred(heap, SIZE_CLASS_COUNT + idx); +#else + _memory_deallocate_deferred(heap, SIZE_CLASS_COUNT + idx); +#endif + //Step 1: Find span in one of the cache levels + span_t* span = _memory_heap_cache_extract(heap, span_count); + if (!span) { + //Step 2: Map in more virtual memory + span = _memory_map_spans(heap, span_count); + } + + //Mark span as owned by this heap and set base data + assert((size_t)SPAN_COUNT(span->flags) == span_count); + span->size_class = (uint16_t)(SIZE_CLASS_COUNT + idx); + atomic_store32(&span->heap_id, heap->id); + atomic_thread_fence_release(); + + //Increase counter + _memory_counter_increase(&heap->span_counter[idx], &_memory_max_allocation[idx], span_count); + + return pointer_offset(span, SPAN_HEADER_SIZE); +} + +//! Allocate a new heap +static heap_t* +_memory_allocate_heap(void) { + void* raw_heap; + void* next_raw_heap; + uintptr_t orphan_counter; + heap_t* heap; + heap_t* next_heap; + //Try getting an orphaned heap + atomic_thread_fence_acquire(); + do { + raw_heap = atomic_load_ptr(&_memory_orphan_heaps); + heap = (heap_t*)(void*)((uintptr_t)raw_heap & _memory_page_mask); + if (!heap) + break; + next_heap = heap->next_orphan; + orphan_counter = (uintptr_t)atomic_incr32(&_memory_orphan_counter); + next_raw_heap = (void*)((uintptr_t)next_heap | (orphan_counter & ~_memory_page_mask)); + } + while (!atomic_cas_ptr(&_memory_orphan_heaps, next_raw_heap, raw_heap)); + + if (!heap) { + //Map in pages for a new heap + size_t align_offset = 0; + heap = (heap_t*)_memory_map((1 + (sizeof(heap_t) >> _memory_page_size_shift)) * _memory_page_size, &align_offset); + memset(heap, 0, sizeof(heap_t)); + heap->align_offset = align_offset; + + //Get a new heap ID + do { + heap->id = atomic_incr32(&_memory_heap_id); + if (_memory_heap_lookup(heap->id)) + heap->id = 0; + } while (!heap->id); + + //Link in heap in heap ID map + size_t list_idx = heap->id % HEAP_ARRAY_SIZE; + do { + next_heap = (heap_t*)atomic_load_ptr(&_memory_heaps[list_idx]); + heap->next_heap = next_heap; + } while (!atomic_cas_ptr(&_memory_heaps[list_idx], heap, next_heap)); + } + +#if ENABLE_THREAD_CACHE + heap->span_counter[0].cache_limit = MIN_SPAN_CACHE_RELEASE + MIN_SPAN_CACHE_SIZE; + for (size_t idx = 1; idx < LARGE_CLASS_COUNT; ++idx) + heap->span_counter[idx].cache_limit = MIN_LARGE_SPAN_CACHE_RELEASE + MIN_LARGE_SPAN_CACHE_SIZE; +#endif + + //Clean up any deferred operations + _memory_unmap_deferred(heap, 0); + _memory_deallocate_deferred(heap, 0); + + return heap; +} + +//! 
Deallocate the given small/medium memory block from the given heap +static void +_memory_deallocate_to_heap(heap_t* heap, span_t* span, void* p) { + //Check if span is the currently active span in order to operate + //on the correct bookkeeping data + assert(SPAN_COUNT(span->flags) == 1); + const count_t class_idx = span->size_class; + size_class_t* size_class = _memory_size_class + class_idx; + int is_active = (heap->active_span[class_idx] == span); + span_block_t* block_data = is_active ? + heap->active_block + class_idx : + &span->data.block; + + //Check if the span will become completely free + if (block_data->free_count == ((count_t)size_class->block_count - 1)) { +#if ENABLE_THREAD_CACHE + //Track counters + assert(heap->span_counter[0].current_allocations > 0); + if (heap->span_counter[0].current_allocations) + --heap->span_counter[0].current_allocations; +#endif + + //If it was active, reset counter. Otherwise, if not active, remove from + //partial free list if we had a previous free block (guard for classes with only 1 block) + if (is_active) + block_data->free_count = 0; + else if (block_data->free_count > 0) + _memory_span_list_doublelink_remove(&heap->size_cache[class_idx], span); + + //Add to heap span cache + _memory_heap_cache_insert(heap, span); + return; + } + + //Check if first free block for this span (previously fully allocated) + if (block_data->free_count == 0) { + //add to free list and disable autolink + _memory_span_list_doublelink_add(&heap->size_cache[class_idx], span); + block_data->first_autolink = (uint16_t)size_class->block_count; + } + ++block_data->free_count; + //Span is not yet completely free, so add block to the linked list of free blocks + void* blocks_start = pointer_offset(span, SPAN_HEADER_SIZE); + count_t block_offset = (count_t)pointer_diff(p, blocks_start); + count_t block_idx = block_offset / (count_t)size_class->size; + uint32_t* block = (uint32_t*)pointer_offset(blocks_start, block_idx * size_class->size); + *block = block_data->free_list; + block_data->free_list = (uint16_t)block_idx; +} + +//! Deallocate the given large memory block from the given heap +static void +_memory_deallocate_large_to_heap(heap_t* heap, span_t* span) { + //Decrease counter + size_t idx = (size_t)span->size_class - SIZE_CLASS_COUNT; + size_t span_count = idx + 1; + assert((size_t)SPAN_COUNT(span->flags) == span_count); + assert(span->size_class >= SIZE_CLASS_COUNT); + assert(idx < LARGE_CLASS_COUNT); +#if ENABLE_THREAD_CACHE + assert(heap->span_counter[idx].current_allocations > 0); + if (heap->span_counter[idx].current_allocations) + --heap->span_counter[idx].current_allocations; +#endif + if (!heap->spans_reserved && (span_count > 1)) { + //Split the span and store remainder as reserved spans + //Must split to a dummy 1-span master since we cannot have master spans as reserved + _memory_set_span_remainder_as_reserved(heap, span, 1); + span_count = 1; + } + + //Insert into cache list + _memory_heap_cache_insert(heap, span); +} + +//! 
Process pending deferred cross-thread deallocations +static int +_memory_deallocate_deferred(heap_t* heap, size_t size_class) { + //Grab the current list of deferred deallocations + atomic_thread_fence_acquire(); + void* p = atomic_load_ptr(&heap->defer_deallocate); + if (!p || !atomic_cas_ptr(&heap->defer_deallocate, 0, p)) + return 0; + //Keep track if we deallocate in the given size class + int got_class = 0; + do { + void* next = *(void**)p; + //Get span and check which type of block + span_t* span = (span_t*)(void*)((uintptr_t)p & _memory_span_mask); + if (span->size_class < SIZE_CLASS_COUNT) { + //Small/medium block + got_class |= (span->size_class == size_class); + _memory_deallocate_to_heap(heap, span, p); + } + else { + //Large block + got_class |= ((span->size_class >= size_class) && (span->size_class <= (size_class + 2))); + _memory_deallocate_large_to_heap(heap, span); + } + //Loop until all pending operations in list are processed + p = next; + } while (p); + return got_class; +} + +//! Defer deallocation of the given block to the given heap +static void +_memory_deallocate_defer(int32_t heap_id, void* p) { + //Get the heap and link in pointer in list of deferred operations + heap_t* heap = _memory_heap_lookup(heap_id); + if (!heap) + return; + void* last_ptr; + do { + last_ptr = atomic_load_ptr(&heap->defer_deallocate); + *(void**)p = last_ptr; //Safe to use block, it's being deallocated + } while (!atomic_cas_ptr(&heap->defer_deallocate, p, last_ptr)); +} + +//! Allocate a block of the given size +static void* +_memory_allocate(size_t size) { + if (size <= _memory_medium_size_limit) + return _memory_allocate_from_heap(get_thread_heap(), size); + else if (size <= LARGE_SIZE_LIMIT) + return _memory_allocate_large_from_heap(get_thread_heap(), size); + + //Oversized, allocate pages directly + size += SPAN_HEADER_SIZE; + size_t num_pages = size >> _memory_page_size_shift; + if (size & (_memory_page_size - 1)) + ++num_pages; + size_t align_offset = 0; + span_t* span = (span_t*)_memory_map(num_pages * _memory_page_size, &align_offset); + atomic_store32(&span->heap_id, 0); + //Store page count in next_span + span->next_span = (span_t*)((uintptr_t)num_pages); + span->data.list.align_offset = (uint16_t)align_offset; + + return pointer_offset(span, SPAN_HEADER_SIZE); +} + +//! Deallocate the given block +static void +_memory_deallocate(void* p) { + if (!p) + return; + + //Grab the span (always at start of span, using 64KiB alignment) + span_t* span = (span_t*)(void*)((uintptr_t)p & _memory_span_mask); + int32_t heap_id = atomic_load32(&span->heap_id); + heap_t* heap = get_thread_heap(); + //Check if block belongs to this heap or if deallocation should be deferred + if (heap_id == heap->id) { + if (span->size_class < SIZE_CLASS_COUNT) + _memory_deallocate_to_heap(heap, span, p); + else + _memory_deallocate_large_to_heap(heap, span); + } + else if (heap_id > 0) { + _memory_deallocate_defer(heap_id, p); + } + else { + //Oversized allocation, page count is stored in next_span + size_t num_pages = (size_t)span->next_span; + _memory_unmap(span, num_pages * _memory_page_size, span->data.list.align_offset, 1); + } +} + +//! 
Reallocate the given block to the given size +static void* +_memory_reallocate(void* p, size_t size, size_t oldsize, unsigned int flags) { + if (p) { + //Grab the span using guaranteed span alignment + span_t* span = (span_t*)(void*)((uintptr_t)p & _memory_span_mask); + int32_t heap_id = atomic_load32(&span->heap_id); + if (heap_id) { + if (span->size_class < SIZE_CLASS_COUNT) { + //Small/medium sized block + size_class_t* size_class = _memory_size_class + span->size_class; + if ((size_t)size_class->size >= size) + return p; //Still fits in block, never mind trying to save memory + if (!oldsize) + oldsize = size_class->size; + } + else { + //Large block + size_t total_size = size + SPAN_HEADER_SIZE; + size_t num_spans = total_size >> _memory_span_size_shift; + if (total_size & (_memory_span_mask - 1)) + ++num_spans; + size_t current_spans = (span->size_class - SIZE_CLASS_COUNT) + 1; + if ((current_spans >= num_spans) && (num_spans >= (current_spans / 2))) + return p; //Still fits and less than half of memory would be freed + if (!oldsize) + oldsize = (current_spans * _memory_span_size) - SPAN_HEADER_SIZE; + } + } + else { + //Oversized block + size_t total_size = size + SPAN_HEADER_SIZE; + size_t num_pages = total_size >> _memory_page_size_shift; + if (total_size & (_memory_page_size - 1)) + ++num_pages; + //Page count is stored in next_span + size_t current_pages = (size_t)span->next_span; + if ((current_pages >= num_pages) && (num_pages >= (current_pages / 2))) + return p; //Still fits and less than half of memory would be freed + if (!oldsize) + oldsize = (current_pages * _memory_page_size) - SPAN_HEADER_SIZE; + } + } + + //Size is greater than block size, need to allocate a new block and deallocate the old + //Avoid hysteresis by overallocating if increase is small (below 37%) + size_t lower_bound = oldsize + (oldsize >> 2) + (oldsize >> 3); + void* block = _memory_allocate((size > lower_bound) ? size : ((size > oldsize) ? lower_bound : size)); + if (p) { + if (!(flags & RPMALLOC_NO_PRESERVE)) + memcpy(block, p, oldsize < size ? oldsize : size); + _memory_deallocate(p); + } + + return block; +} + +//! Get the usable size of the given block +static size_t +_memory_usable_size(void* p) { + //Grab the span using guaranteed span alignment + span_t* span = (span_t*)(void*)((uintptr_t)p & _memory_span_mask); + int32_t heap_id = atomic_load32(&span->heap_id); + if (heap_id) { + //Small/medium block + if (span->size_class < SIZE_CLASS_COUNT) + return _memory_size_class[span->size_class].size; + + //Large block + size_t current_spans = (span->size_class - SIZE_CLASS_COUNT) + 1; + return (current_spans * _memory_span_size) - SPAN_HEADER_SIZE; + } + + //Oversized block, page count is stored in next_span + size_t current_pages = (size_t)span->next_span; + return (current_pages * _memory_page_size) - SPAN_HEADER_SIZE; +} + +//! 
Adjust and optimize the size class properties for the given class +static void +_memory_adjust_size_class(size_t iclass) { + size_t block_size = _memory_size_class[iclass].size; + size_t block_count = (_memory_span_size - SPAN_HEADER_SIZE) / block_size; + + _memory_size_class[iclass].block_count = (uint16_t)block_count; + _memory_size_class[iclass].class_idx = (uint16_t)iclass; + + //Check if previous size classes can be merged + size_t prevclass = iclass; + while (prevclass > 0) { + --prevclass; + //A class can be merged if number of pages and number of blocks are equal + if (_memory_size_class[prevclass].block_count == _memory_size_class[iclass].block_count) { + memcpy(_memory_size_class + prevclass, _memory_size_class + iclass, sizeof(_memory_size_class[iclass])); + } + else { + break; + } + } +} + +} + +#if defined( _WIN32 ) || defined( __WIN32__ ) || defined( _WIN64 ) +# include <windows.h> +#else +# include <sys/mman.h> +# include <sched.h> +# ifndef MAP_UNINITIALIZED +# define MAP_UNINITIALIZED 0 +# endif +#endif +#include <errno.h> + +namespace tracy +{ + +//! Initialize the allocator and setup global data +int +rpmalloc_initialize(void) { + memset(&_memory_config, 0, sizeof(rpmalloc_config_t)); + return rpmalloc_initialize_config(0); +} + +int +rpmalloc_initialize_config(const rpmalloc_config_t* config) { + if (config) + memcpy(&_memory_config, config, sizeof(rpmalloc_config_t)); + + if (!_memory_config.memory_map || !_memory_config.memory_unmap) { + _memory_config.memory_map = _memory_map_os; + _memory_config.memory_unmap = _memory_unmap_os; + } + + _memory_page_size = _memory_config.page_size; + if (!_memory_page_size) { +#if PLATFORM_WINDOWS + SYSTEM_INFO system_info; + memset(&system_info, 0, sizeof(system_info)); + GetSystemInfo(&system_info); + _memory_page_size = system_info.dwPageSize; + _memory_map_granularity = system_info.dwAllocationGranularity; +#else + _memory_page_size = (size_t)sysconf(_SC_PAGESIZE); + _memory_map_granularity = _memory_page_size; +#endif + } + + if (_memory_page_size < 512) + _memory_page_size = 512; + if (_memory_page_size > (16 * 1024)) + _memory_page_size = (16 * 1024); + + _memory_page_size_shift = 0; + size_t page_size_bit = _memory_page_size; + while (page_size_bit != 1) { + ++_memory_page_size_shift; + page_size_bit >>= 1; + } + _memory_page_size = ((size_t)1 << _memory_page_size_shift); + _memory_page_mask = ~(uintptr_t)(_memory_page_size - 1); + + size_t span_size = _memory_config.span_size; + if (!span_size) + span_size = (64 * 1024); + if (span_size > (256 * 1024)) + span_size = (256 * 1024); + _memory_span_size = 4096; + _memory_span_size_shift = 12; + while ((_memory_span_size < span_size) || (_memory_span_size < _memory_page_size)) { + _memory_span_size <<= 1; + ++_memory_span_size_shift; + } + _memory_span_mask = ~(uintptr_t)(_memory_span_size - 1); + + _memory_config.page_size = _memory_page_size; + _memory_config.span_size = _memory_span_size; + + if (!_memory_config.span_map_count) + _memory_config.span_map_count = DEFAULT_SPAN_MAP_COUNT; + if (_memory_config.span_size * _memory_config.span_map_count < _memory_config.page_size) + _memory_config.span_map_count = (_memory_config.page_size / _memory_config.span_size); + if (_memory_config.span_map_count > 128) + _memory_config.span_map_count = 128; + +#if defined(__APPLE__) && ENABLE_PRELOAD + if (pthread_key_create(&_memory_thread_heap, 0)) + return -1; +#endif + + atomic_store32(&_memory_heap_id, 0); + atomic_store32(&_memory_orphan_counter, 0); + 
atomic_store32(&_memory_active_heaps, 0); + + //Setup all small and medium size classes + size_t iclass; + for (iclass = 0; iclass < SMALL_CLASS_COUNT; ++iclass) { + size_t size = (iclass + 1) * SMALL_GRANULARITY; + _memory_size_class[iclass].size = (uint16_t)size; + _memory_adjust_size_class(iclass); + } + + _memory_medium_size_limit = _memory_span_size - SPAN_HEADER_SIZE; + if (_memory_medium_size_limit > MEDIUM_SIZE_LIMIT) + _memory_medium_size_limit = MEDIUM_SIZE_LIMIT; + for (iclass = 0; iclass < MEDIUM_CLASS_COUNT; ++iclass) { + size_t size = SMALL_SIZE_LIMIT + ((iclass + 1) * MEDIUM_GRANULARITY); + if (size > _memory_medium_size_limit) + size = _memory_medium_size_limit; + _memory_size_class[SMALL_CLASS_COUNT + iclass].size = (uint16_t)size; + _memory_adjust_size_class(SMALL_CLASS_COUNT + iclass); + } + + //Initialize this thread + rpmalloc_thread_initialize(); + return 0; +} + +//! Finalize the allocator +void +rpmalloc_finalize(void) { + atomic_thread_fence_acquire(); + + rpmalloc_thread_finalize(); + //If you hit this assert, you still have active threads or forgot to finalize some thread(s) + assert(atomic_load32(&_memory_active_heaps) == 0); + + //Free all thread caches + for (size_t list_idx = 0; list_idx < HEAP_ARRAY_SIZE; ++list_idx) { + heap_t* heap = (heap_t*)atomic_load_ptr(&_memory_heaps[list_idx]); + while (heap) { + _memory_deallocate_deferred(heap, 0); + + //Free span caches (other thread might have deferred after the thread using this heap finalized) +#if ENABLE_THREAD_CACHE + for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) { + if (heap->span_cache[iclass]) + _memory_unmap_span_list(0, heap->span_cache[iclass]); + } +#endif + heap = heap->next_heap; + } + } + +#if ENABLE_GLOBAL_CACHE + //Free global caches + for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) + _memory_cache_finalize(&_memory_span_cache[iclass]); +#endif + + for (size_t list_idx = 0; list_idx < HEAP_ARRAY_SIZE; ++list_idx) { + heap_t* heap = (heap_t*)atomic_load_ptr(&_memory_heaps[list_idx]); + atomic_store_ptr(&_memory_heaps[list_idx], 0); + while (heap) { + if (heap->spans_reserved) { + span_t* span = heap->span_reserve; + span_t* master = heap->span_reserve_master; + uint32_t remains = SPAN_REMAINS(master->flags); + + assert(master != span); + assert(remains >= heap->spans_reserved); + _memory_unmap(span, heap->spans_reserved * _memory_span_size, 0, 0); + _memory_statistics_sub(&_reserved_spans, heap->spans_reserved); + remains = ((uint32_t)heap->spans_reserved >= remains) ? 0 : (remains - (uint32_t)heap->spans_reserved); + if (!remains) { + uint32_t master_span_count = SPAN_COUNT(master->flags); + _memory_statistics_sub(&_reserved_spans, master_span_count); + _memory_unmap(master, master_span_count * _memory_span_size, master->data.list.align_offset, 1); + } + else { + SPAN_SET_REMAINS(master->flags, remains); + } + } + + _memory_unmap_deferred(heap, 0); + + heap_t* next_heap = heap->next_heap; + _memory_unmap(heap, (1 + (sizeof(heap_t) >> _memory_page_size_shift)) * _memory_page_size, heap->align_offset, 1); + heap = next_heap; + } + } + atomic_store_ptr(&_memory_orphan_heaps, 0); + atomic_thread_fence_release(); + +#if ENABLE_STATISTICS + //If you hit these asserts you probably have memory leaks or double frees in your code + assert(!atomic_load32(&_mapped_pages)); + assert(!atomic_load32(&_reserved_spans)); +#endif + +#if defined(__APPLE__) && ENABLE_PRELOAD + pthread_key_delete(_memory_thread_heap); +#endif +} + +//! 
Initialize thread, assign heap +void +rpmalloc_thread_initialize(void) { + if (!get_thread_heap()) { + atomic_incr32(&_memory_active_heaps); + heap_t* heap = _memory_allocate_heap(); +#if ENABLE_STATISTICS + heap->thread_to_global = 0; + heap->global_to_thread = 0; +#endif + set_thread_heap(heap); + } +} + +//! Finalize thread, orphan heap +void +rpmalloc_thread_finalize(void) { + heap_t* heap = get_thread_heap(); + if (!heap) + return; + + _memory_deallocate_deferred(heap, 0); + _memory_unmap_deferred(heap, 0); + + //Release thread cache spans back to global cache +#if ENABLE_THREAD_CACHE + for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) { + span_t* span = heap->span_cache[iclass]; +#if ENABLE_GLOBAL_CACHE + const size_t span_count = iclass + 1; + while (span) { + assert((size_t)SPAN_COUNT(span->flags) == span_count); + span_t* next = _memory_span_list_split(span, !iclass ? MIN_SPAN_CACHE_RELEASE : (MIN_LARGE_SPAN_CACHE_RELEASE / span_count)); + _memory_global_cache_insert(0, span); + span = next; + } +#else + if (span) + _memory_unmap_span_list(heap, span); +#endif + heap->span_cache[iclass] = 0; + } +#endif + + //Orphan the heap + void* raw_heap; + uintptr_t orphan_counter; + heap_t* last_heap; + do { + last_heap = (heap_t*)atomic_load_ptr(&_memory_orphan_heaps); + heap->next_orphan = (heap_t*)(void*)((uintptr_t)last_heap & _memory_page_mask); + orphan_counter = (uintptr_t)atomic_incr32(&_memory_orphan_counter); + raw_heap = (void*)((uintptr_t)heap | (orphan_counter & ~_memory_page_mask)); + } + while (!atomic_cas_ptr(&_memory_orphan_heaps, raw_heap, last_heap)); + + set_thread_heap(0); + atomic_add32(&_memory_active_heaps, -1); +} + +int +rpmalloc_is_thread_initialized(void) { + return (get_thread_heap() != 0) ? 1 : 0; +} + +const rpmalloc_config_t* +rpmalloc_config(void) { + return &_memory_config; +} + +//! Map new pages to virtual memory +static void* +_memory_map_os(size_t size, size_t* offset) { + //Either size is a heap (a single page) or a (multiple) span - we only need to align spans + size_t padding = ((size >= _memory_span_size) && (_memory_span_size > _memory_map_granularity)) ? _memory_span_size : 0; + +#if PLATFORM_WINDOWS + //Ok to MEM_COMMIT - according to MSDN, "actual physical pages are not allocated unless/until the virtual addresses are actually accessed" + void* ptr = VirtualAlloc(0, size + padding, MEM_RESERVE | MEM_COMMIT, PAGE_READWRITE); + if (!ptr) { + assert("Failed to map virtual memory block" && 0); + return 0; + } +#else + void* ptr = mmap(0, size + padding, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS | MAP_UNINITIALIZED, -1, 0); + if ((ptr == MAP_FAILED) || !ptr) { + assert("Failed to map virtual memory block" && 0); + return 0; + } +#endif + + if (padding) { + size_t final_padding = padding - ((uintptr_t)ptr & ~_memory_span_mask); +#if PLATFORM_POSIX + //Unmap the last unused pages, for Windows this is done with the final VirtualFree with MEM_RELEASE call + size_t remains = padding - final_padding; + if (remains) + munmap(pointer_offset(ptr, final_padding + size), remains); +#endif + ptr = pointer_offset(ptr, final_padding); + assert(final_padding <= _memory_span_size); + assert(!(final_padding & 5)); + assert(!((uintptr_t)ptr & ~_memory_span_mask)); + *offset = final_padding >> 3; + assert(*offset < 65536); + } + + return ptr; +} + +//! 
Unmap pages from virtual memory +static void +_memory_unmap_os(void* address, size_t size, size_t offset, int release) { + assert(release || (offset == 0)); + if (release && offset) { + offset <<= 3; +#if PLATFORM_POSIX + size += offset; +#endif + address = pointer_offset(address, -(int32_t)offset); + } +#if PLATFORM_WINDOWS + if (!VirtualFree(address, release ? 0 : size, release ? MEM_RELEASE : MEM_DECOMMIT)) { + DWORD err = GetLastError(); + (void)err; + assert("Failed to unmap virtual memory block" && 0); + } +#else + MEMORY_UNUSED(release); + if (munmap(address, size)) { + assert("Failed to unmap virtual memory block" && 0); + } +#endif +} + +#if ENABLE_GUARDS +static void +_memory_guard_validate(void* p) { + if (!p) + return; + void* block_start; + size_t block_size = _memory_usable_size(p); + span_t* span = (void*)((uintptr_t)p & _memory_span_mask); + int32_t heap_id = atomic_load32(&span->heap_id); + if (heap_id) { + if (span->size_class < SIZE_CLASS_COUNT) { + void* span_blocks_start = pointer_offset(span, SPAN_HEADER_SIZE); + size_class_t* size_class = _memory_size_class + span->size_class; + count_t block_offset = (count_t)pointer_diff(p, span_blocks_start); + count_t block_idx = block_offset / (count_t)size_class->size; + block_start = pointer_offset(span_blocks_start, block_idx * size_class->size); + } + else { + block_start = pointer_offset(span, SPAN_HEADER_SIZE); + } + } + else { + block_start = pointer_offset(span, SPAN_HEADER_SIZE); + } + uint32_t* deadzone = block_start; + //If these asserts fire, you have written to memory before the block start + for (int i = 0; i < 8; ++i) { + if (deadzone[i] != MAGIC_GUARD) { + if (_memory_config.memory_overwrite) + _memory_config.memory_overwrite(p); + else + assert("Memory overwrite before block start" && 0); + return; + } + deadzone[i] = 0; + } + deadzone = (uint32_t*)pointer_offset(block_start, block_size - 32); + //If these asserts fire, you have written to memory after the block end + for (int i = 0; i < 8; ++i) { + if (deadzone[i] != MAGIC_GUARD) { + if (_memory_config.memory_overwrite) + _memory_config.memory_overwrite(p); + else + assert("Memory overwrite after block end" && 0); + return; + } + deadzone[i] = 0; + } +} +#else +#define _memory_guard_validate(block) +#endif + +#if ENABLE_GUARDS +static void +_memory_guard_block(void* block) { + if (block) { + size_t block_size = _memory_usable_size(block); + uint32_t* deadzone = block; + deadzone[0] = deadzone[1] = deadzone[2] = deadzone[3] = + deadzone[4] = deadzone[5] = deadzone[6] = deadzone[7] = MAGIC_GUARD; + deadzone = (uint32_t*)pointer_offset(block, block_size - 32); + deadzone[0] = deadzone[1] = deadzone[2] = deadzone[3] = + deadzone[4] = deadzone[5] = deadzone[6] = deadzone[7] = MAGIC_GUARD; + } +} +#define _memory_guard_pre_alloc(size) size += 64 +#define _memory_guard_pre_realloc(block, size) block = pointer_offset(block, -32); size += 64 +#define _memory_guard_post_alloc(block, size) _memory_guard_block(block); block = pointer_offset(block, 32); size -= 64 +#else +#define _memory_guard_pre_alloc(size) +#define _memory_guard_pre_realloc(block, size) +#define _memory_guard_post_alloc(block, size) +#endif + +// Extern interface + +TRACY_API RPMALLOC_RESTRICT void* +rpmalloc(size_t size) { +#if ENABLE_VALIDATE_ARGS + if (size >= MAX_ALLOC_SIZE) { + errno = EINVAL; + return 0; + } +#endif + _memory_guard_pre_alloc(size); + void* block = _memory_allocate(size); + _memory_guard_post_alloc(block, size); + return block; +} + +TRACY_API void +rpfree(void* ptr) { + 
_memory_guard_validate(ptr); + _memory_deallocate(ptr); +} + +RPMALLOC_RESTRICT void* +rpcalloc(size_t num, size_t size) { + size_t total; +#if ENABLE_VALIDATE_ARGS +#if PLATFORM_WINDOWS + int err = SizeTMult(num, size, &total); + if ((err != S_OK) || (total >= MAX_ALLOC_SIZE)) { + errno = EINVAL; + return 0; + } +#else + int err = __builtin_umull_overflow(num, size, &total); + if (err || (total >= MAX_ALLOC_SIZE)) { + errno = EINVAL; + return 0; + } +#endif +#else + total = num * size; +#endif + _memory_guard_pre_alloc(total); + void* block = _memory_allocate(total); + _memory_guard_post_alloc(block, total); + memset(block, 0, total); + return block; +} + +void* +rprealloc(void* ptr, size_t size) { +#if ENABLE_VALIDATE_ARGS + if (size >= MAX_ALLOC_SIZE) { + errno = EINVAL; + return ptr; + } +#endif + _memory_guard_validate(ptr); + _memory_guard_pre_realloc(ptr, size); + void* block = _memory_reallocate(ptr, size, 0, 0); + _memory_guard_post_alloc(block, size); + return block; +} + +void* +rpaligned_realloc(void* ptr, size_t alignment, size_t size, size_t oldsize, + unsigned int flags) { +#if ENABLE_VALIDATE_ARGS + if ((size + alignment < size) || (alignment > _memory_page_size)) { + errno = EINVAL; + return 0; + } +#endif + void* block; + if (alignment > 32) { + block = rpaligned_alloc(alignment, size); + if (!(flags & RPMALLOC_NO_PRESERVE)) + memcpy(block, ptr, oldsize < size ? oldsize : size); + rpfree(ptr); + } + else { + _memory_guard_validate(ptr); + _memory_guard_pre_realloc(ptr, size); + block = _memory_reallocate(ptr, size, oldsize, flags); + _memory_guard_post_alloc(block, size); + } + return block; +} + +RPMALLOC_RESTRICT void* +rpaligned_alloc(size_t alignment, size_t size) { + if (alignment <= 32) + return rpmalloc(size); + +#if ENABLE_VALIDATE_ARGS + if ((size + alignment < size) || (alignment > _memory_page_size)) { + errno = EINVAL; + return 0; + } +#endif + + void* ptr = rpmalloc(size + alignment); + if ((uintptr_t)ptr & (alignment - 1)) + ptr = (void*)(((uintptr_t)ptr & ~((uintptr_t)alignment - 1)) + alignment); + return ptr; +} + +RPMALLOC_RESTRICT void* +rpmemalign(size_t alignment, size_t size) { + return rpaligned_alloc(alignment, size); +} + +int +rpposix_memalign(void **memptr, size_t alignment, size_t size) { + if (memptr) + *memptr = rpaligned_alloc(alignment, size); + else + return EINVAL; + return *memptr ? 
0 : ENOMEM; +} + +size_t +rpmalloc_usable_size(void* ptr) { + size_t size = 0; + if (ptr) { + size = _memory_usable_size(ptr); +#if ENABLE_GUARDS + size -= 64; +#endif + } + return size; +} + +void +rpmalloc_thread_collect(void) { + heap_t* heap = get_thread_heap(); + _memory_unmap_deferred(heap, 0); + _memory_deallocate_deferred(0, 0); +} + +void +rpmalloc_thread_statistics(rpmalloc_thread_statistics_t* stats) { + memset(stats, 0, sizeof(rpmalloc_thread_statistics_t)); + heap_t* heap = get_thread_heap(); + void* p = atomic_load_ptr(&heap->defer_deallocate); + while (p) { + void* next = *(void**)p; + span_t* span = (span_t*)(void*)((uintptr_t)p & _memory_span_mask); + stats->deferred += _memory_size_class[span->size_class].size; + p = next; + } + + for (size_t isize = 0; isize < SIZE_CLASS_COUNT; ++isize) { + if (heap->active_block[isize].free_count) + stats->active += heap->active_block[isize].free_count * _memory_size_class[heap->active_span[isize]->size_class].size; + + span_t* cache = heap->size_cache[isize]; + while (cache) { + stats->sizecache = cache->data.block.free_count * _memory_size_class[cache->size_class].size; + cache = cache->next_span; + } + } + +#if ENABLE_THREAD_CACHE + for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) { + if (heap->span_cache[iclass]) + stats->spancache = (size_t)heap->span_cache[iclass]->data.list.size * (iclass + 1) * _memory_span_size; + } +#endif +} + +void +rpmalloc_global_statistics(rpmalloc_global_statistics_t* stats) { + memset(stats, 0, sizeof(rpmalloc_global_statistics_t)); +#if ENABLE_STATISTICS + stats->mapped = (size_t)atomic_load32(&_mapped_pages) * _memory_page_size; + stats->mapped_total = (size_t)atomic_load32(&_mapped_total) * _memory_page_size; + stats->unmapped_total = (size_t)atomic_load32(&_unmapped_total) * _memory_page_size; +#endif +#if ENABLE_GLOBAL_CACHE + for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) { + stats->cached += (size_t)atomic_load32(&_memory_span_cache[iclass].size) * (iclass + 1) * _memory_span_size; + } +#endif +} + +} + +#ifdef _MSC_VER +# pragma warning( pop ) +#endif + +#endif diff --git a/libs/tracy/client/tracy_rpmalloc.hpp b/libs/tracy/client/tracy_rpmalloc.hpp @@ -0,0 +1,153 @@ +/* rpmalloc.h - Memory allocator - Public Domain - 2016 Mattias Jansson / Rampant Pixels + * + * This library provides a cross-platform lock free thread caching malloc implementation in C11. + * The latest source code is always available at + * + * https://github.com/rampantpixels/rpmalloc + * + * This library is put in the public domain; you can redistribute it and/or modify it without any restrictions. + * + */ + +#pragma once + +#include <stddef.h> + +#include "../common/TracyApi.h" + +namespace tracy +{ + +#if defined(__clang__) || defined(__GNUC__) +# define RPMALLOC_ATTRIBUTE __attribute__((__malloc__)) +# define RPMALLOC_RESTRICT +# define RPMALLOC_CDECL +#elif defined(_MSC_VER) +# define RPMALLOC_ATTRIBUTE +# define RPMALLOC_RESTRICT __declspec(restrict) +# define RPMALLOC_CDECL __cdecl +#else +# define RPMALLOC_ATTRIBUTE +# define RPMALLOC_RESTRICT +# define RPMALLOC_CDECL +#endif + +//! Flag to rpaligned_realloc to not preserve content in reallocation +#define RPMALLOC_NO_PRESERVE 1 + +typedef struct rpmalloc_global_statistics_t { + //! Current amount of virtual memory mapped (only if ENABLE_STATISTICS=1) + size_t mapped; + //! Current amount of memory in global caches for small and medium sizes (<64KiB) + size_t cached; + //! 
Total amount of memory mapped (only if ENABLE_STATISTICS=1) + size_t mapped_total; + //! Total amount of memory unmapped (only if ENABLE_STATISTICS=1) + size_t unmapped_total; +} rpmalloc_global_statistics_t; + +typedef struct rpmalloc_thread_statistics_t { + //! Current number of bytes available for allocation from active spans + size_t active; + //! Current number of bytes available in thread size class caches + size_t sizecache; + //! Current number of bytes available in thread span caches + size_t spancache; + //! Current number of bytes in pending deferred deallocations + size_t deferred; + //! Total number of bytes transitioned from thread cache to global cache + size_t thread_to_global; + //! Total number of bytes transitioned from global cache to thread cache + size_t global_to_thread; +} rpmalloc_thread_statistics_t; + +typedef struct rpmalloc_config_t { + //! Map memory pages for the given number of bytes. The returned address MUST be + // aligned to the rpmalloc span size, which will always be a power of two. + // Optionally the function can store an alignment offset in the offset variable + // in case it performs alignment and the returned pointer is offset from the + // actual start of the memory region due to this alignment. The alignment offset + // will be passed to the memory unmap function. The alignment offset MUST NOT be + // larger than 65535 (storable in an uint16_t), if it is you must use natural + // alignment to shift it into 16 bits. + void* (*memory_map)(size_t size, size_t* offset); + //! Unmap the memory pages starting at address and spanning the given number of bytes. + // If release is set to 1, the unmap is for an entire span range as returned by + // a previous call to memory_map and that the entire range should be released. + // If release is set to 0, the unmap is a partial decommit of a subset of the mapped + // memory range. + void (*memory_unmap)(void* address, size_t size, size_t offset, int release); + //! Size of memory pages. The page size MUST be a power of two in [512,16384] range + // (2^9 to 2^14) unless 0 - set to 0 to use system page size. All memory mapping + // requests to memory_map will be made with size set to a multiple of the page size. + size_t page_size; + //! Size of a span of memory pages. MUST be a multiple of page size, and in [4096,262144] + // range (unless 0 - set to 0 to use the default span size). + size_t span_size; + //! Number of spans to map at each request to map new virtual memory blocks. This can + // be used to minimize the system call overhead at the cost of virtual memory address + // space. The extra mapped pages will not be written until actually used, so physical + // committed memory should not be affected in the default implementation. + size_t span_map_count; + //! Debug callback if memory guards are enabled. 
Called if a memory overwrite is detected + void (*memory_overwrite)(void* address); +} rpmalloc_config_t; + +extern int +rpmalloc_initialize(void); + +extern int +rpmalloc_initialize_config(const rpmalloc_config_t* config); + +extern const rpmalloc_config_t* +rpmalloc_config(void); + +extern void +rpmalloc_finalize(void); + +void +rpmalloc_thread_initialize(void); + +extern void +rpmalloc_thread_finalize(void); + +extern void +rpmalloc_thread_collect(void); + +extern int +rpmalloc_is_thread_initialized(void); + +extern void +rpmalloc_thread_statistics(rpmalloc_thread_statistics_t* stats); + +extern void +rpmalloc_global_statistics(rpmalloc_global_statistics_t* stats); + +TRACY_API RPMALLOC_RESTRICT void* +rpmalloc(size_t size) RPMALLOC_ATTRIBUTE; + +TRACY_API void +rpfree(void* ptr); + +extern RPMALLOC_RESTRICT void* +rpcalloc(size_t num, size_t size) RPMALLOC_ATTRIBUTE; + +extern void* +rprealloc(void* ptr, size_t size); + +extern void* +rpaligned_realloc(void* ptr, size_t alignment, size_t size, size_t oldsize, unsigned int flags); + +extern RPMALLOC_RESTRICT void* +rpaligned_alloc(size_t alignment, size_t size) RPMALLOC_ATTRIBUTE; + +extern RPMALLOC_RESTRICT void* +rpmemalign(size_t alignment, size_t size) RPMALLOC_ATTRIBUTE; + +extern int +rpposix_memalign(void **memptr, size_t alignment, size_t size); + +extern size_t +rpmalloc_usable_size(void* ptr); + +} diff --git a/libs/tracy/common/TracyAlign.hpp b/libs/tracy/common/TracyAlign.hpp @@ -0,0 +1,27 @@ +#ifndef __TRACYALIGN_HPP__ +#define __TRACYALIGN_HPP__ + +#include <string.h> + +#include "TracyForceInline.hpp" + +namespace tracy +{ + +template<typename T> +tracy_force_inline T MemRead( const void* ptr ) +{ + T val; + memcpy( &val, ptr, sizeof( T ) ); + return val; +} + +template<typename T> +tracy_force_inline void MemWrite( void* ptr, T val ) +{ + memcpy( ptr, &val, sizeof( T ) ); +} + +} + +#endif diff --git a/libs/tracy/common/TracyAlloc.hpp b/libs/tracy/common/TracyAlloc.hpp @@ -0,0 +1,33 @@ +#ifndef __TRACYALLOC_HPP__ +#define __TRACYALLOC_HPP__ + +#include <stdlib.h> + +#ifdef TRACY_ENABLE +# include "../client/tracy_rpmalloc.hpp" +#endif + +namespace tracy +{ + +static inline void* tracy_malloc( size_t size ) +{ +#ifdef TRACY_ENABLE + return rpmalloc( size ); +#else + return malloc( size ); +#endif +} + +static inline void tracy_free( void* ptr ) +{ +#ifdef TRACY_ENABLE + rpfree( ptr ); +#else + free( ptr ); +#endif +} + +} + +#endif diff --git a/libs/tracy/common/TracyApi.h b/libs/tracy/common/TracyApi.h @@ -0,0 +1,14 @@ +#ifndef __TRACYAPI_H__ +#define __TRACYAPI_H__ + +#ifdef _WIN32 +# if defined TRACY_IMPORTS +# define TRACY_API __declspec(dllimport) +# else +# define TRACY_API __declspec(dllexport) +# endif +#else +# define TRACY_API __attribute__((visibility("default"))) +#endif + +#endif // __TRACYAPI_H__ diff --git a/libs/tracy/common/TracyColor.hpp b/libs/tracy/common/TracyColor.hpp @@ -0,0 +1,690 @@ +#ifndef __TRACYCOLOR_HPP__ +#define __TRACYCOLOR_HPP__ + +namespace tracy +{ +struct Color +{ +enum ColorType +{ + Snow = 0xfffafa, + GhostWhite = 0xf8f8ff, + WhiteSmoke = 0xf5f5f5, + Gainsboro = 0xdcdcdc, + FloralWhite = 0xfffaf0, + OldLace = 0xfdf5e6, + Linen = 0xfaf0e6, + AntiqueWhite = 0xfaebd7, + PapayaWhip = 0xffefd5, + BlanchedAlmond = 0xffebcd, + Bisque = 0xffe4c4, + PeachPuff = 0xffdab9, + NavajoWhite = 0xffdead, + Moccasin = 0xffe4b5, + Cornsilk = 0xfff8dc, + Ivory = 0xfffff0, + LemonChiffon = 0xfffacd, + Seashell = 0xfff5ee, + Honeydew = 0xf0fff0, + MintCream = 0xf5fffa, + Azure = 0xf0ffff, + AliceBlue = 
0xf0f8ff, + Lavender = 0xe6e6fa, + LavenderBlush = 0xfff0f5, + MistyRose = 0xffe4e1, + White = 0xffffff, + Black = 0x000000, + DarkSlateGray = 0x2f4f4f, + DarkSlateGrey = 0x2f4f4f, + DimGray = 0x696969, + DimGrey = 0x696969, + SlateGray = 0x708090, + SlateGrey = 0x708090, + LightSlateGray = 0x778899, + LightSlateGrey = 0x778899, + Gray = 0xbebebe, + Grey = 0xbebebe, + X11Gray = 0xbebebe, + X11Grey = 0xbebebe, + WebGray = 0x808080, + WebGrey = 0x808080, + LightGrey = 0xd3d3d3, + LightGray = 0xd3d3d3, + MidnightBlue = 0x191970, + Navy = 0x000080, + NavyBlue = 0x000080, + CornflowerBlue = 0x6495ed, + DarkSlateBlue = 0x483d8b, + SlateBlue = 0x6a5acd, + MediumSlateBlue = 0x7b68ee, + LightSlateBlue = 0x8470ff, + MediumBlue = 0x0000cd, + RoyalBlue = 0x4169e1, + Blue = 0x0000ff, + DodgerBlue = 0x1e90ff, + DeepSkyBlue = 0x00bfff, + SkyBlue = 0x87ceeb, + LightSkyBlue = 0x87cefa, + SteelBlue = 0x4682b4, + LightSteelBlue = 0xb0c4de, + LightBlue = 0xadd8e6, + PowderBlue = 0xb0e0e6, + PaleTurquoise = 0xafeeee, + DarkTurquoise = 0x00ced1, + MediumTurquoise = 0x48d1cc, + Turquoise = 0x40e0d0, + Cyan = 0x00ffff, + Aqua = 0x00ffff, + LightCyan = 0xe0ffff, + CadetBlue = 0x5f9ea0, + MediumAquamarine = 0x66cdaa, + Aquamarine = 0x7fffd4, + DarkGreen = 0x006400, + DarkOliveGreen = 0x556b2f, + DarkSeaGreen = 0x8fbc8f, + SeaGreen = 0x2e8b57, + MediumSeaGreen = 0x3cb371, + LightSeaGreen = 0x20b2aa, + PaleGreen = 0x98fb98, + SpringGreen = 0x00ff7f, + LawnGreen = 0x7cfc00, + Green = 0x00ff00, + Lime = 0x00ff00, + X11Green = 0x00ff00, + WebGreen = 0x008000, + Chartreuse = 0x7fff00, + MediumSpringGreen = 0x00fa9a, + GreenYellow = 0xadff2f, + LimeGreen = 0x32cd32, + YellowGreen = 0x9acd32, + ForestGreen = 0x228b22, + OliveDrab = 0x6b8e23, + DarkKhaki = 0xbdb76b, + Khaki = 0xf0e68c, + PaleGoldenrod = 0xeee8aa, + LightGoldenrodYellow = 0xfafad2, + LightYellow = 0xffffe0, + Yellow = 0xffff00, + Gold = 0xffd700, + LightGoldenrod = 0xeedd82, + Goldenrod = 0xdaa520, + DarkGoldenrod = 0xb8860b, + RosyBrown = 0xbc8f8f, + IndianRed = 0xcd5c5c, + SaddleBrown = 0x8b4513, + Sienna = 0xa0522d, + Peru = 0xcd853f, + Burlywood = 0xdeb887, + Beige = 0xf5f5dc, + Wheat = 0xf5deb3, + SandyBrown = 0xf4a460, + Tan = 0xd2b48c, + Chocolate = 0xd2691e, + Firebrick = 0xb22222, + Brown = 0xa52a2a, + DarkSalmon = 0xe9967a, + Salmon = 0xfa8072, + LightSalmon = 0xffa07a, + Orange = 0xffa500, + DarkOrange = 0xff8c00, + Coral = 0xff7f50, + LightCoral = 0xf08080, + Tomato = 0xff6347, + OrangeRed = 0xff4500, + Red = 0xff0000, + HotPink = 0xff69b4, + DeepPink = 0xff1493, + Pink = 0xffc0cb, + LightPink = 0xffb6c1, + PaleVioletRed = 0xdb7093, + Maroon = 0xb03060, + X11Maroon = 0xb03060, + WebMaroon = 0x800000, + MediumVioletRed = 0xc71585, + VioletRed = 0xd02090, + Magenta = 0xff00ff, + Fuchsia = 0xff00ff, + Violet = 0xee82ee, + Plum = 0xdda0dd, + Orchid = 0xda70d6, + MediumOrchid = 0xba55d3, + DarkOrchid = 0x9932cc, + DarkViolet = 0x9400d3, + BlueViolet = 0x8a2be2, + Purple = 0xa020f0, + X11Purple = 0xa020f0, + WebPurple = 0x800080, + MediumPurple = 0x9370db, + Thistle = 0xd8bfd8, + Snow1 = 0xfffafa, + Snow2 = 0xeee9e9, + Snow3 = 0xcdc9c9, + Snow4 = 0x8b8989, + Seashell1 = 0xfff5ee, + Seashell2 = 0xeee5de, + Seashell3 = 0xcdc5bf, + Seashell4 = 0x8b8682, + AntiqueWhite1 = 0xffefdb, + AntiqueWhite2 = 0xeedfcc, + AntiqueWhite3 = 0xcdc0b0, + AntiqueWhite4 = 0x8b8378, + Bisque1 = 0xffe4c4, + Bisque2 = 0xeed5b7, + Bisque3 = 0xcdb79e, + Bisque4 = 0x8b7d6b, + PeachPuff1 = 0xffdab9, + PeachPuff2 = 0xeecbad, + PeachPuff3 = 0xcdaf95, + PeachPuff4 = 0x8b7765, + 
NavajoWhite1 = 0xffdead, + NavajoWhite2 = 0xeecfa1, + NavajoWhite3 = 0xcdb38b, + NavajoWhite4 = 0x8b795e, + LemonChiffon1 = 0xfffacd, + LemonChiffon2 = 0xeee9bf, + LemonChiffon3 = 0xcdc9a5, + LemonChiffon4 = 0x8b8970, + Cornsilk1 = 0xfff8dc, + Cornsilk2 = 0xeee8cd, + Cornsilk3 = 0xcdc8b1, + Cornsilk4 = 0x8b8878, + Ivory1 = 0xfffff0, + Ivory2 = 0xeeeee0, + Ivory3 = 0xcdcdc1, + Ivory4 = 0x8b8b83, + Honeydew1 = 0xf0fff0, + Honeydew2 = 0xe0eee0, + Honeydew3 = 0xc1cdc1, + Honeydew4 = 0x838b83, + LavenderBlush1 = 0xfff0f5, + LavenderBlush2 = 0xeee0e5, + LavenderBlush3 = 0xcdc1c5, + LavenderBlush4 = 0x8b8386, + MistyRose1 = 0xffe4e1, + MistyRose2 = 0xeed5d2, + MistyRose3 = 0xcdb7b5, + MistyRose4 = 0x8b7d7b, + Azure1 = 0xf0ffff, + Azure2 = 0xe0eeee, + Azure3 = 0xc1cdcd, + Azure4 = 0x838b8b, + SlateBlue1 = 0x836fff, + SlateBlue2 = 0x7a67ee, + SlateBlue3 = 0x6959cd, + SlateBlue4 = 0x473c8b, + RoyalBlue1 = 0x4876ff, + RoyalBlue2 = 0x436eee, + RoyalBlue3 = 0x3a5fcd, + RoyalBlue4 = 0x27408b, + Blue1 = 0x0000ff, + Blue2 = 0x0000ee, + Blue3 = 0x0000cd, + Blue4 = 0x00008b, + DodgerBlue1 = 0x1e90ff, + DodgerBlue2 = 0x1c86ee, + DodgerBlue3 = 0x1874cd, + DodgerBlue4 = 0x104e8b, + SteelBlue1 = 0x63b8ff, + SteelBlue2 = 0x5cacee, + SteelBlue3 = 0x4f94cd, + SteelBlue4 = 0x36648b, + DeepSkyBlue1 = 0x00bfff, + DeepSkyBlue2 = 0x00b2ee, + DeepSkyBlue3 = 0x009acd, + DeepSkyBlue4 = 0x00688b, + SkyBlue1 = 0x87ceff, + SkyBlue2 = 0x7ec0ee, + SkyBlue3 = 0x6ca6cd, + SkyBlue4 = 0x4a708b, + LightSkyBlue1 = 0xb0e2ff, + LightSkyBlue2 = 0xa4d3ee, + LightSkyBlue3 = 0x8db6cd, + LightSkyBlue4 = 0x607b8b, + SlateGray1 = 0xc6e2ff, + SlateGray2 = 0xb9d3ee, + SlateGray3 = 0x9fb6cd, + SlateGray4 = 0x6c7b8b, + LightSteelBlue1 = 0xcae1ff, + LightSteelBlue2 = 0xbcd2ee, + LightSteelBlue3 = 0xa2b5cd, + LightSteelBlue4 = 0x6e7b8b, + LightBlue1 = 0xbfefff, + LightBlue2 = 0xb2dfee, + LightBlue3 = 0x9ac0cd, + LightBlue4 = 0x68838b, + LightCyan1 = 0xe0ffff, + LightCyan2 = 0xd1eeee, + LightCyan3 = 0xb4cdcd, + LightCyan4 = 0x7a8b8b, + PaleTurquoise1 = 0xbbffff, + PaleTurquoise2 = 0xaeeeee, + PaleTurquoise3 = 0x96cdcd, + PaleTurquoise4 = 0x668b8b, + CadetBlue1 = 0x98f5ff, + CadetBlue2 = 0x8ee5ee, + CadetBlue3 = 0x7ac5cd, + CadetBlue4 = 0x53868b, + Turquoise1 = 0x00f5ff, + Turquoise2 = 0x00e5ee, + Turquoise3 = 0x00c5cd, + Turquoise4 = 0x00868b, + Cyan1 = 0x00ffff, + Cyan2 = 0x00eeee, + Cyan3 = 0x00cdcd, + Cyan4 = 0x008b8b, + DarkSlateGray1 = 0x97ffff, + DarkSlateGray2 = 0x8deeee, + DarkSlateGray3 = 0x79cdcd, + DarkSlateGray4 = 0x528b8b, + Aquamarine1 = 0x7fffd4, + Aquamarine2 = 0x76eec6, + Aquamarine3 = 0x66cdaa, + Aquamarine4 = 0x458b74, + DarkSeaGreen1 = 0xc1ffc1, + DarkSeaGreen2 = 0xb4eeb4, + DarkSeaGreen3 = 0x9bcd9b, + DarkSeaGreen4 = 0x698b69, + SeaGreen1 = 0x54ff9f, + SeaGreen2 = 0x4eee94, + SeaGreen3 = 0x43cd80, + SeaGreen4 = 0x2e8b57, + PaleGreen1 = 0x9aff9a, + PaleGreen2 = 0x90ee90, + PaleGreen3 = 0x7ccd7c, + PaleGreen4 = 0x548b54, + SpringGreen1 = 0x00ff7f, + SpringGreen2 = 0x00ee76, + SpringGreen3 = 0x00cd66, + SpringGreen4 = 0x008b45, + Green1 = 0x00ff00, + Green2 = 0x00ee00, + Green3 = 0x00cd00, + Green4 = 0x008b00, + Chartreuse1 = 0x7fff00, + Chartreuse2 = 0x76ee00, + Chartreuse3 = 0x66cd00, + Chartreuse4 = 0x458b00, + OliveDrab1 = 0xc0ff3e, + OliveDrab2 = 0xb3ee3a, + OliveDrab3 = 0x9acd32, + OliveDrab4 = 0x698b22, + DarkOliveGreen1 = 0xcaff70, + DarkOliveGreen2 = 0xbcee68, + DarkOliveGreen3 = 0xa2cd5a, + DarkOliveGreen4 = 0x6e8b3d, + Khaki1 = 0xfff68f, + Khaki2 = 0xeee685, + Khaki3 = 0xcdc673, + Khaki4 = 0x8b864e, + LightGoldenrod1 = 
0xffec8b, + LightGoldenrod2 = 0xeedc82, + LightGoldenrod3 = 0xcdbe70, + LightGoldenrod4 = 0x8b814c, + LightYellow1 = 0xffffe0, + LightYellow2 = 0xeeeed1, + LightYellow3 = 0xcdcdb4, + LightYellow4 = 0x8b8b7a, + Yellow1 = 0xffff00, + Yellow2 = 0xeeee00, + Yellow3 = 0xcdcd00, + Yellow4 = 0x8b8b00, + Gold1 = 0xffd700, + Gold2 = 0xeec900, + Gold3 = 0xcdad00, + Gold4 = 0x8b7500, + Goldenrod1 = 0xffc125, + Goldenrod2 = 0xeeb422, + Goldenrod3 = 0xcd9b1d, + Goldenrod4 = 0x8b6914, + DarkGoldenrod1 = 0xffb90f, + DarkGoldenrod2 = 0xeead0e, + DarkGoldenrod3 = 0xcd950c, + DarkGoldenrod4 = 0x8b6508, + RosyBrown1 = 0xffc1c1, + RosyBrown2 = 0xeeb4b4, + RosyBrown3 = 0xcd9b9b, + RosyBrown4 = 0x8b6969, + IndianRed1 = 0xff6a6a, + IndianRed2 = 0xee6363, + IndianRed3 = 0xcd5555, + IndianRed4 = 0x8b3a3a, + Sienna1 = 0xff8247, + Sienna2 = 0xee7942, + Sienna3 = 0xcd6839, + Sienna4 = 0x8b4726, + Burlywood1 = 0xffd39b, + Burlywood2 = 0xeec591, + Burlywood3 = 0xcdaa7d, + Burlywood4 = 0x8b7355, + Wheat1 = 0xffe7ba, + Wheat2 = 0xeed8ae, + Wheat3 = 0xcdba96, + Wheat4 = 0x8b7e66, + Tan1 = 0xffa54f, + Tan2 = 0xee9a49, + Tan3 = 0xcd853f, + Tan4 = 0x8b5a2b, + Chocolate1 = 0xff7f24, + Chocolate2 = 0xee7621, + Chocolate3 = 0xcd661d, + Chocolate4 = 0x8b4513, + Firebrick1 = 0xff3030, + Firebrick2 = 0xee2c2c, + Firebrick3 = 0xcd2626, + Firebrick4 = 0x8b1a1a, + Brown1 = 0xff4040, + Brown2 = 0xee3b3b, + Brown3 = 0xcd3333, + Brown4 = 0x8b2323, + Salmon1 = 0xff8c69, + Salmon2 = 0xee8262, + Salmon3 = 0xcd7054, + Salmon4 = 0x8b4c39, + LightSalmon1 = 0xffa07a, + LightSalmon2 = 0xee9572, + LightSalmon3 = 0xcd8162, + LightSalmon4 = 0x8b5742, + Orange1 = 0xffa500, + Orange2 = 0xee9a00, + Orange3 = 0xcd8500, + Orange4 = 0x8b5a00, + DarkOrange1 = 0xff7f00, + DarkOrange2 = 0xee7600, + DarkOrange3 = 0xcd6600, + DarkOrange4 = 0x8b4500, + Coral1 = 0xff7256, + Coral2 = 0xee6a50, + Coral3 = 0xcd5b45, + Coral4 = 0x8b3e2f, + Tomato1 = 0xff6347, + Tomato2 = 0xee5c42, + Tomato3 = 0xcd4f39, + Tomato4 = 0x8b3626, + OrangeRed1 = 0xff4500, + OrangeRed2 = 0xee4000, + OrangeRed3 = 0xcd3700, + OrangeRed4 = 0x8b2500, + Red1 = 0xff0000, + Red2 = 0xee0000, + Red3 = 0xcd0000, + Red4 = 0x8b0000, + DeepPink1 = 0xff1493, + DeepPink2 = 0xee1289, + DeepPink3 = 0xcd1076, + DeepPink4 = 0x8b0a50, + HotPink1 = 0xff6eb4, + HotPink2 = 0xee6aa7, + HotPink3 = 0xcd6090, + HotPink4 = 0x8b3a62, + Pink1 = 0xffb5c5, + Pink2 = 0xeea9b8, + Pink3 = 0xcd919e, + Pink4 = 0x8b636c, + LightPink1 = 0xffaeb9, + LightPink2 = 0xeea2ad, + LightPink3 = 0xcd8c95, + LightPink4 = 0x8b5f65, + PaleVioletRed1 = 0xff82ab, + PaleVioletRed2 = 0xee799f, + PaleVioletRed3 = 0xcd6889, + PaleVioletRed4 = 0x8b475d, + Maroon1 = 0xff34b3, + Maroon2 = 0xee30a7, + Maroon3 = 0xcd2990, + Maroon4 = 0x8b1c62, + VioletRed1 = 0xff3e96, + VioletRed2 = 0xee3a8c, + VioletRed3 = 0xcd3278, + VioletRed4 = 0x8b2252, + Magenta1 = 0xff00ff, + Magenta2 = 0xee00ee, + Magenta3 = 0xcd00cd, + Magenta4 = 0x8b008b, + Orchid1 = 0xff83fa, + Orchid2 = 0xee7ae9, + Orchid3 = 0xcd69c9, + Orchid4 = 0x8b4789, + Plum1 = 0xffbbff, + Plum2 = 0xeeaeee, + Plum3 = 0xcd96cd, + Plum4 = 0x8b668b, + MediumOrchid1 = 0xe066ff, + MediumOrchid2 = 0xd15fee, + MediumOrchid3 = 0xb452cd, + MediumOrchid4 = 0x7a378b, + DarkOrchid1 = 0xbf3eff, + DarkOrchid2 = 0xb23aee, + DarkOrchid3 = 0x9a32cd, + DarkOrchid4 = 0x68228b, + Purple1 = 0x9b30ff, + Purple2 = 0x912cee, + Purple3 = 0x7d26cd, + Purple4 = 0x551a8b, + MediumPurple1 = 0xab82ff, + MediumPurple2 = 0x9f79ee, + MediumPurple3 = 0x8968cd, + MediumPurple4 = 0x5d478b, + Thistle1 = 0xffe1ff, + Thistle2 = 0xeed2ee, 
+ Thistle3 = 0xcdb5cd, + Thistle4 = 0x8b7b8b, + Gray0 = 0x000000, + Grey0 = 0x000000, + Gray1 = 0x030303, + Grey1 = 0x030303, + Gray2 = 0x050505, + Grey2 = 0x050505, + Gray3 = 0x080808, + Grey3 = 0x080808, + Gray4 = 0x0a0a0a, + Grey4 = 0x0a0a0a, + Gray5 = 0x0d0d0d, + Grey5 = 0x0d0d0d, + Gray6 = 0x0f0f0f, + Grey6 = 0x0f0f0f, + Gray7 = 0x121212, + Grey7 = 0x121212, + Gray8 = 0x141414, + Grey8 = 0x141414, + Gray9 = 0x171717, + Grey9 = 0x171717, + Gray10 = 0x1a1a1a, + Grey10 = 0x1a1a1a, + Gray11 = 0x1c1c1c, + Grey11 = 0x1c1c1c, + Gray12 = 0x1f1f1f, + Grey12 = 0x1f1f1f, + Gray13 = 0x212121, + Grey13 = 0x212121, + Gray14 = 0x242424, + Grey14 = 0x242424, + Gray15 = 0x262626, + Grey15 = 0x262626, + Gray16 = 0x292929, + Grey16 = 0x292929, + Gray17 = 0x2b2b2b, + Grey17 = 0x2b2b2b, + Gray18 = 0x2e2e2e, + Grey18 = 0x2e2e2e, + Gray19 = 0x303030, + Grey19 = 0x303030, + Gray20 = 0x333333, + Grey20 = 0x333333, + Gray21 = 0x363636, + Grey21 = 0x363636, + Gray22 = 0x383838, + Grey22 = 0x383838, + Gray23 = 0x3b3b3b, + Grey23 = 0x3b3b3b, + Gray24 = 0x3d3d3d, + Grey24 = 0x3d3d3d, + Gray25 = 0x404040, + Grey25 = 0x404040, + Gray26 = 0x424242, + Grey26 = 0x424242, + Gray27 = 0x454545, + Grey27 = 0x454545, + Gray28 = 0x474747, + Grey28 = 0x474747, + Gray29 = 0x4a4a4a, + Grey29 = 0x4a4a4a, + Gray30 = 0x4d4d4d, + Grey30 = 0x4d4d4d, + Gray31 = 0x4f4f4f, + Grey31 = 0x4f4f4f, + Gray32 = 0x525252, + Grey32 = 0x525252, + Gray33 = 0x545454, + Grey33 = 0x545454, + Gray34 = 0x575757, + Grey34 = 0x575757, + Gray35 = 0x595959, + Grey35 = 0x595959, + Gray36 = 0x5c5c5c, + Grey36 = 0x5c5c5c, + Gray37 = 0x5e5e5e, + Grey37 = 0x5e5e5e, + Gray38 = 0x616161, + Grey38 = 0x616161, + Gray39 = 0x636363, + Grey39 = 0x636363, + Gray40 = 0x666666, + Grey40 = 0x666666, + Gray41 = 0x696969, + Grey41 = 0x696969, + Gray42 = 0x6b6b6b, + Grey42 = 0x6b6b6b, + Gray43 = 0x6e6e6e, + Grey43 = 0x6e6e6e, + Gray44 = 0x707070, + Grey44 = 0x707070, + Gray45 = 0x737373, + Grey45 = 0x737373, + Gray46 = 0x757575, + Grey46 = 0x757575, + Gray47 = 0x787878, + Grey47 = 0x787878, + Gray48 = 0x7a7a7a, + Grey48 = 0x7a7a7a, + Gray49 = 0x7d7d7d, + Grey49 = 0x7d7d7d, + Gray50 = 0x7f7f7f, + Grey50 = 0x7f7f7f, + Gray51 = 0x828282, + Grey51 = 0x828282, + Gray52 = 0x858585, + Grey52 = 0x858585, + Gray53 = 0x878787, + Grey53 = 0x878787, + Gray54 = 0x8a8a8a, + Grey54 = 0x8a8a8a, + Gray55 = 0x8c8c8c, + Grey55 = 0x8c8c8c, + Gray56 = 0x8f8f8f, + Grey56 = 0x8f8f8f, + Gray57 = 0x919191, + Grey57 = 0x919191, + Gray58 = 0x949494, + Grey58 = 0x949494, + Gray59 = 0x969696, + Grey59 = 0x969696, + Gray60 = 0x999999, + Grey60 = 0x999999, + Gray61 = 0x9c9c9c, + Grey61 = 0x9c9c9c, + Gray62 = 0x9e9e9e, + Grey62 = 0x9e9e9e, + Gray63 = 0xa1a1a1, + Grey63 = 0xa1a1a1, + Gray64 = 0xa3a3a3, + Grey64 = 0xa3a3a3, + Gray65 = 0xa6a6a6, + Grey65 = 0xa6a6a6, + Gray66 = 0xa8a8a8, + Grey66 = 0xa8a8a8, + Gray67 = 0xababab, + Grey67 = 0xababab, + Gray68 = 0xadadad, + Grey68 = 0xadadad, + Gray69 = 0xb0b0b0, + Grey69 = 0xb0b0b0, + Gray70 = 0xb3b3b3, + Grey70 = 0xb3b3b3, + Gray71 = 0xb5b5b5, + Grey71 = 0xb5b5b5, + Gray72 = 0xb8b8b8, + Grey72 = 0xb8b8b8, + Gray73 = 0xbababa, + Grey73 = 0xbababa, + Gray74 = 0xbdbdbd, + Grey74 = 0xbdbdbd, + Gray75 = 0xbfbfbf, + Grey75 = 0xbfbfbf, + Gray76 = 0xc2c2c2, + Grey76 = 0xc2c2c2, + Gray77 = 0xc4c4c4, + Grey77 = 0xc4c4c4, + Gray78 = 0xc7c7c7, + Grey78 = 0xc7c7c7, + Gray79 = 0xc9c9c9, + Grey79 = 0xc9c9c9, + Gray80 = 0xcccccc, + Grey80 = 0xcccccc, + Gray81 = 0xcfcfcf, + Grey81 = 0xcfcfcf, + Gray82 = 0xd1d1d1, + Grey82 = 0xd1d1d1, + Gray83 = 0xd4d4d4, + Grey83 = 0xd4d4d4, 
+ Gray84 = 0xd6d6d6, + Grey84 = 0xd6d6d6, + Gray85 = 0xd9d9d9, + Grey85 = 0xd9d9d9, + Gray86 = 0xdbdbdb, + Grey86 = 0xdbdbdb, + Gray87 = 0xdedede, + Grey87 = 0xdedede, + Gray88 = 0xe0e0e0, + Grey88 = 0xe0e0e0, + Gray89 = 0xe3e3e3, + Grey89 = 0xe3e3e3, + Gray90 = 0xe5e5e5, + Grey90 = 0xe5e5e5, + Gray91 = 0xe8e8e8, + Grey91 = 0xe8e8e8, + Gray92 = 0xebebeb, + Grey92 = 0xebebeb, + Gray93 = 0xededed, + Grey93 = 0xededed, + Gray94 = 0xf0f0f0, + Grey94 = 0xf0f0f0, + Gray95 = 0xf2f2f2, + Grey95 = 0xf2f2f2, + Gray96 = 0xf5f5f5, + Grey96 = 0xf5f5f5, + Gray97 = 0xf7f7f7, + Grey97 = 0xf7f7f7, + Gray98 = 0xfafafa, + Grey98 = 0xfafafa, + Gray99 = 0xfcfcfc, + Grey99 = 0xfcfcfc, + Gray100 = 0xffffff, + Grey100 = 0xffffff, + DarkGrey = 0xa9a9a9, + DarkGray = 0xa9a9a9, + DarkBlue = 0x00008b, + DarkCyan = 0x008b8b, + DarkMagenta = 0x8b008b, + DarkRed = 0x8b0000, + LightGreen = 0x90ee90, + Crimson = 0xdc143c, + Indigo = 0x4b0082, + Olive = 0x808000, + RebeccaPurple = 0x663399, + Silver = 0xc0c0c0, + Teal = 0x008080, +}; +}; +} + +#endif diff --git a/libs/tracy/common/TracyForceInline.hpp b/libs/tracy/common/TracyForceInline.hpp @@ -0,0 +1,20 @@ +#ifndef __TRACYFORCEINLINE_HPP__ +#define __TRACYFORCEINLINE_HPP__ + +#if defined(__GNUC__) +# define tracy_force_inline __attribute__((always_inline)) inline +#elif defined(_MSC_VER) +# define tracy_force_inline __forceinline +#else +# define tracy_force_inline inline +#endif + +#if defined(__GNUC__) +# define tracy_no_inline __attribute__((noinline)) +#elif defined(_MSC_VER) +# define tracy_no_inline __declspec(noinline) +#else +# define tracy_no_inline +#endif + +#endif diff --git a/libs/tracy/common/TracyMutex.hpp b/libs/tracy/common/TracyMutex.hpp @@ -0,0 +1,33 @@ +#ifndef __TRACYMUTEX_HPP__ +#define __TRACYMUTEX_HPP__ + +#if defined _MSC_VER + +# include <shared_mutex> + +namespace tracy +{ +using TracyMutex = std::shared_mutex; +} + +#elif defined __CYGWIN__ + +#include "tracy_benaphore.h" + +namespace tracy +{ +using TracyMutex = NonRecursiveBenaphore; +} + +#else + +#include <mutex> + +namespace tracy +{ +using TracyMutex = std::mutex; +} + +#endif + +#endif diff --git a/libs/tracy/common/TracyProtocol.hpp b/libs/tracy/common/TracyProtocol.hpp @@ -0,0 +1,103 @@ +#ifndef __TRACYPROTOCOL_HPP__ +#define __TRACYPROTOCOL_HPP__ + +#include <limits> +#include <stdint.h> + +namespace tracy +{ + +constexpr unsigned Lz4CompressBound( unsigned isize ) { return isize + ( isize / 255 ) + 16; } + +enum : uint32_t { ProtocolVersion = 24 }; +enum : uint32_t { BroadcastVersion = 0 }; + +using lz4sz_t = uint32_t; + +enum { TargetFrameSize = 256 * 1024 }; +enum { LZ4Size = Lz4CompressBound( TargetFrameSize ) }; +static_assert( LZ4Size <= std::numeric_limits<lz4sz_t>::max(), "LZ4Size greater than lz4sz_t" ); +static_assert( TargetFrameSize * 2 >= 64 * 1024, "Not enough space for LZ4 stream buffer" ); + +enum { HandshakeShibbolethSize = 8 }; +static const char HandshakeShibboleth[HandshakeShibbolethSize] = { 'T', 'r', 'a', 'c', 'y', 'P', 'r', 'f' }; + +enum HandshakeStatus : uint8_t +{ + HandshakePending, + HandshakeWelcome, + HandshakeProtocolMismatch, + HandshakeNotAvailable, + HandshakeDropped +}; + +enum { WelcomeMessageProgramNameSize = 64 }; +enum { WelcomeMessageHostInfoSize = 1024 }; + +#pragma pack( 1 ) + +enum ServerQuery : uint8_t +{ + ServerQueryTerminate, + ServerQueryString, + ServerQueryThreadString, + ServerQuerySourceLocation, + ServerQueryPlotName, + ServerQueryCallstackFrame, + ServerQueryFrameName, + ServerQueryDisconnect, + ServerQueryExternalName, + 
ServerQueryParameter +}; + +struct ServerQueryPacket +{ + ServerQuery type; + uint64_t ptr; +}; + +enum { ServerQueryPacketSize = sizeof( ServerQueryPacket ) }; + + +struct WelcomeMessage +{ + double timerMul; + int64_t initBegin; + int64_t initEnd; + uint64_t delay; + uint64_t resolution; + uint64_t epoch; + uint64_t pid; + uint8_t onDemand; + uint8_t isApple; + char programName[WelcomeMessageProgramNameSize]; + char hostInfo[WelcomeMessageHostInfoSize]; +}; + +enum { WelcomeMessageSize = sizeof( WelcomeMessage ) }; + + +struct OnDemandPayloadMessage +{ + uint64_t frames; + uint64_t currentTime; +}; + +enum { OnDemandPayloadMessageSize = sizeof( OnDemandPayloadMessage ) }; + + +struct BroadcastMessage +{ + uint32_t broadcastVersion; + uint32_t protocolVersion; + uint32_t activeTime; // in seconds + char programName[WelcomeMessageProgramNameSize]; +}; + +enum { BroadcastMessageSize = sizeof( BroadcastMessage ) }; + +#pragma pack() + +} + +#endif diff --git a/libs/tracy/common/TracyQueue.hpp b/libs/tracy/common/TracyQueue.hpp @@ -0,0 +1,500 @@ +#ifndef __TRACYQUEUE_HPP__ +#define __TRACYQUEUE_HPP__ + +#include <stdint.h> + +namespace tracy +{ + +enum class QueueType : uint8_t +{ + ZoneText, + ZoneName, + Message, + MessageColor, + MessageCallstack, + MessageColorCallstack, + MessageAppInfo, + ZoneBeginAllocSrcLoc, + ZoneBeginAllocSrcLocCallstack, + CallstackMemory, + Callstack, + CallstackAlloc, + FrameImage, + ZoneBegin, + ZoneBeginCallstack, + ZoneEnd, + LockWait, + LockObtain, + LockRelease, + LockSharedWait, + LockSharedObtain, + LockSharedRelease, + MemAlloc, + MemFree, + MemAllocCallstack, + MemFreeCallstack, + GpuZoneBegin, + GpuZoneBeginCallstack, + GpuZoneEnd, + GpuZoneBeginSerial, + GpuZoneBeginCallstackSerial, + GpuZoneEndSerial, + PlotData, + ContextSwitch, + ThreadWakeup, + GpuTime, + Terminate, + KeepAlive, + ThreadContext, + Crash, + CrashReport, + ZoneValidation, + FrameMarkMsg, + FrameMarkMsgStart, + FrameMarkMsgEnd, + SourceLocation, + LockAnnounce, + LockTerminate, + LockMark, + MessageLiteral, + MessageLiteralColor, + MessageLiteralCallstack, + MessageLiteralColorCallstack, + GpuNewContext, + CallstackFrameSize, + CallstackFrame, + SysTimeReport, + TidToPid, + PlotConfig, + ParamSetup, + StringData, + ThreadName, + CustomStringData, + PlotName, + SourceLocationPayload, + CallstackPayload, + CallstackAllocPayload, + FrameName, + FrameImageData, + ExternalName, + ExternalThreadName, + NUM_TYPES +}; + +#pragma pack( 1 ) + +struct QueueThreadContext +{ + uint64_t thread; +}; + +struct QueueZoneBegin +{ + int64_t time; + uint64_t srcloc; // ptr +}; + +struct QueueZoneEnd +{ + int64_t time; +}; + +struct QueueZoneValidation +{ + uint32_t id; +}; + +struct QueueStringTransfer +{ + uint64_t ptr; +}; + +struct QueueFrameMark +{ + int64_t time; + uint64_t name; // ptr +}; + +struct QueueFrameImage +{ + uint64_t image; // ptr + uint64_t frame; + uint16_t w; + uint16_t h; + uint8_t flip; +}; + +struct QueueSourceLocation +{ + uint64_t name; + uint64_t function; // ptr + uint64_t file; // ptr + uint32_t line; + uint8_t r; + uint8_t g; + uint8_t b; +}; + +struct QueueZoneText +{ + uint64_t text; // ptr +}; + +enum class LockType : uint8_t +{ + Lockable, + SharedLockable +}; + +struct QueueLockAnnounce +{ + uint32_t id; + int64_t time; + uint64_t lckloc; // ptr + LockType type; +}; + +struct QueueLockTerminate +{ + uint32_t id; + int64_t time; + LockType type; +}; + +struct QueueLockWait +{ + uint64_t thread; + uint32_t id; + int64_t time; + LockType type; +}; + +struct QueueLockObtain 
+{ + uint64_t thread; + uint32_t id; + int64_t time; +}; + +struct QueueLockRelease +{ + uint64_t thread; + uint32_t id; + int64_t time; +}; + +struct QueueLockMark +{ + uint64_t thread; + uint32_t id; + uint64_t srcloc; // ptr +}; + +enum class PlotDataType : uint8_t +{ + Float, + Double, + Int +}; + +struct QueuePlotData +{ + uint64_t name; // ptr + int64_t time; + PlotDataType type; + union + { + double d; + float f; + int64_t i; + } data; +}; + +struct QueueMessage +{ + int64_t time; + uint64_t text; // ptr +}; + +struct QueueMessageColor : public QueueMessage +{ + uint8_t r; + uint8_t g; + uint8_t b; +}; + +struct QueueGpuNewContext +{ + int64_t cpuTime; + int64_t gpuTime; + uint64_t thread; + float period; + uint8_t context; + uint8_t accuracyBits; +}; + +struct QueueGpuZoneBegin +{ + int64_t cpuTime; + uint64_t srcloc; + uint64_t thread; + uint16_t queryId; + uint8_t context; +}; + +struct QueueGpuZoneEnd +{ + int64_t cpuTime; + uint64_t thread; + uint16_t queryId; + uint8_t context; +}; + +struct QueueGpuTime +{ + int64_t gpuTime; + uint16_t queryId; + uint8_t context; +}; + +struct QueueMemAlloc +{ + int64_t time; + uint64_t thread; + uint64_t ptr; + char size[6]; +}; + +struct QueueMemFree +{ + int64_t time; + uint64_t thread; + uint64_t ptr; +}; + +struct QueueCallstackMemory +{ + uint64_t ptr; +}; + +struct QueueCallstack +{ + uint64_t ptr; +}; + +struct QueueCallstackAlloc +{ + uint64_t ptr; + uint64_t nativePtr; +}; + +struct QueueCallstackFrameSize +{ + uint64_t ptr; + uint8_t size; +}; + +struct QueueCallstackFrame +{ + uint64_t name; + uint64_t file; + uint32_t line; +}; + +struct QueueCrashReport +{ + int64_t time; + uint64_t text; // ptr +}; + +struct QueueSysTime +{ + int64_t time; + float sysTime; +}; + +struct QueueContextSwitch +{ + int64_t time; + uint64_t oldThread; + uint64_t newThread; + uint8_t cpu; + uint8_t reason; + uint8_t state; +}; + +struct QueueThreadWakeup +{ + int64_t time; + uint64_t thread; +}; + +struct QueueTidToPid +{ + uint64_t tid; + uint64_t pid; +}; + +enum class PlotFormatType : uint8_t +{ + Number, + Memory, + Percentage +}; + +struct QueuePlotConfig +{ + uint64_t name; // ptr + uint8_t type; +}; + +struct QueueParamSetup +{ + uint32_t idx; + uint64_t name; // ptr + uint8_t isBool; + int32_t val; +}; + +struct QueueHeader +{ + union + { + QueueType type; + uint8_t idx; + }; +}; + +struct QueueItem +{ + QueueHeader hdr; + union + { + QueueThreadContext threadCtx; + QueueZoneBegin zoneBegin; + QueueZoneEnd zoneEnd; + QueueZoneValidation zoneValidation; + QueueStringTransfer stringTransfer; + QueueFrameMark frameMark; + QueueFrameImage frameImage; + QueueSourceLocation srcloc; + QueueZoneText zoneText; + QueueLockAnnounce lockAnnounce; + QueueLockTerminate lockTerminate; + QueueLockWait lockWait; + QueueLockObtain lockObtain; + QueueLockRelease lockRelease; + QueueLockMark lockMark; + QueuePlotData plotData; + QueueMessage message; + QueueMessageColor messageColor; + QueueGpuNewContext gpuNewContext; + QueueGpuZoneBegin gpuZoneBegin; + QueueGpuZoneEnd gpuZoneEnd; + QueueGpuTime gpuTime; + QueueMemAlloc memAlloc; + QueueMemFree memFree; + QueueCallstackMemory callstackMemory; + QueueCallstack callstack; + QueueCallstackAlloc callstackAlloc; + QueueCallstackFrameSize callstackFrameSize; + QueueCallstackFrame callstackFrame; + QueueCrashReport crashReport; + QueueSysTime sysTime; + QueueContextSwitch contextSwitch; + QueueThreadWakeup threadWakeup; + QueueTidToPid tidToPid; + QueuePlotConfig plotConfig; + QueueParamSetup paramSetup; + }; +}; 
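For illustration, here is a minimal sketch (not part of the patch) of how an event is meant to be packed into this tagged union: the header byte selects the active payload member, and the QueueDataSize table further below gives the number of bytes that are actually meaningful for each type. The helper function, the include path, and the literal values are hypothetical.

    #include <stdint.h>
    #include "TracyQueue.hpp"  // assumed include path for the header shown above

    // Hedged sketch: pack a zone-begin event. The real producer writes the item
    // into a lock-free queue; that step is omitted here.
    static void SketchPackZoneBegin( int64_t now, uint64_t srcloc )
    {
        tracy::QueueItem item;
        item.hdr.type = tracy::QueueType::ZoneBegin;   // selects the zoneBegin member
        item.zoneBegin.time = now;                     // profiler timestamp
        item.zoneBegin.srcloc = srcloc;                // pointer to a static source location record
        // sizeof( item ) is 32 (see the static_assert below); only
        // QueueDataSize[item.hdr.idx] bytes of it carry payload for this type.
        (void)item;
    }
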
+#pragma pack() + + +enum { QueueItemSize = sizeof( QueueItem ) }; + +static const size_t QueueDataSize[] = { + sizeof( QueueHeader ) + sizeof( QueueZoneText ), + sizeof( QueueHeader ) + sizeof( QueueZoneText ), // zone name + sizeof( QueueHeader ) + sizeof( QueueMessage ), + sizeof( QueueHeader ) + sizeof( QueueMessageColor ), + sizeof( QueueHeader ) + sizeof( QueueMessage ), // callstack + sizeof( QueueHeader ) + sizeof( QueueMessageColor ), // callstack + sizeof( QueueHeader ) + sizeof( QueueMessage ), // app info + sizeof( QueueHeader ) + sizeof( QueueZoneBegin ), // allocated source location + sizeof( QueueHeader ) + sizeof( QueueZoneBegin ), // allocated source location, callstack + sizeof( QueueHeader ) + sizeof( QueueCallstackMemory ), + sizeof( QueueHeader ) + sizeof( QueueCallstack ), + sizeof( QueueHeader ) + sizeof( QueueCallstackAlloc ), + sizeof( QueueHeader ) + sizeof( QueueFrameImage ), + sizeof( QueueHeader ) + sizeof( QueueZoneBegin ), + sizeof( QueueHeader ) + sizeof( QueueZoneBegin ), // callstack + sizeof( QueueHeader ) + sizeof( QueueZoneEnd ), + sizeof( QueueHeader ) + sizeof( QueueLockWait ), + sizeof( QueueHeader ) + sizeof( QueueLockObtain ), + sizeof( QueueHeader ) + sizeof( QueueLockRelease ), + sizeof( QueueHeader ) + sizeof( QueueLockWait ), // shared + sizeof( QueueHeader ) + sizeof( QueueLockObtain ), // shared + sizeof( QueueHeader ) + sizeof( QueueLockRelease ), // shared + sizeof( QueueHeader ) + sizeof( QueueMemAlloc ), + sizeof( QueueHeader ) + sizeof( QueueMemFree ), + sizeof( QueueHeader ) + sizeof( QueueMemAlloc ), // callstack + sizeof( QueueHeader ) + sizeof( QueueMemFree ), // callstack + sizeof( QueueHeader ) + sizeof( QueueGpuZoneBegin ), + sizeof( QueueHeader ) + sizeof( QueueGpuZoneBegin ), // callstack + sizeof( QueueHeader ) + sizeof( QueueGpuZoneEnd ), + sizeof( QueueHeader ) + sizeof( QueueGpuZoneBegin ), // serial + sizeof( QueueHeader ) + sizeof( QueueGpuZoneBegin ), // serial, callstack + sizeof( QueueHeader ) + sizeof( QueueGpuZoneEnd ), // serial + sizeof( QueueHeader ) + sizeof( QueuePlotData ), + sizeof( QueueHeader ) + sizeof( QueueContextSwitch ), + sizeof( QueueHeader ) + sizeof( QueueThreadWakeup ), + sizeof( QueueHeader ) + sizeof( QueueGpuTime ), + // above items must be first + sizeof( QueueHeader ), // terminate + sizeof( QueueHeader ), // keep alive + sizeof( QueueHeader ) + sizeof( QueueThreadContext ), + sizeof( QueueHeader ), // crash + sizeof( QueueHeader ) + sizeof( QueueCrashReport ), + sizeof( QueueHeader ) + sizeof( QueueZoneValidation ), + sizeof( QueueHeader ) + sizeof( QueueFrameMark ), // continuous frames + sizeof( QueueHeader ) + sizeof( QueueFrameMark ), // start + sizeof( QueueHeader ) + sizeof( QueueFrameMark ), // end + sizeof( QueueHeader ) + sizeof( QueueSourceLocation ), + sizeof( QueueHeader ) + sizeof( QueueLockAnnounce ), + sizeof( QueueHeader ) + sizeof( QueueLockTerminate ), + sizeof( QueueHeader ) + sizeof( QueueLockMark ), + sizeof( QueueHeader ) + sizeof( QueueMessage ), // literal + sizeof( QueueHeader ) + sizeof( QueueMessageColor ), // literal + sizeof( QueueHeader ) + sizeof( QueueMessage ), // literal, callstack + sizeof( QueueHeader ) + sizeof( QueueMessageColor ), // literal, callstack + sizeof( QueueHeader ) + sizeof( QueueGpuNewContext ), + sizeof( QueueHeader ) + sizeof( QueueCallstackFrameSize ), + sizeof( QueueHeader ) + sizeof( QueueCallstackFrame ), + sizeof( QueueHeader ) + sizeof( QueueSysTime ), + sizeof( QueueHeader ) + sizeof( QueueTidToPid ), + sizeof( QueueHeader ) + sizeof( 
QueuePlotConfig ), + sizeof( QueueHeader ) + sizeof( QueueParamSetup ), + // keep all QueueStringTransfer below + sizeof( QueueHeader ) + sizeof( QueueStringTransfer ), // string data + sizeof( QueueHeader ) + sizeof( QueueStringTransfer ), // thread name + sizeof( QueueHeader ) + sizeof( QueueStringTransfer ), // custom string data + sizeof( QueueHeader ) + sizeof( QueueStringTransfer ), // plot name + sizeof( QueueHeader ) + sizeof( QueueStringTransfer ), // allocated source location payload + sizeof( QueueHeader ) + sizeof( QueueStringTransfer ), // callstack payload + sizeof( QueueHeader ) + sizeof( QueueStringTransfer ), // callstack alloc payload + sizeof( QueueHeader ) + sizeof( QueueStringTransfer ), // frame name + sizeof( QueueHeader ) + sizeof( QueueStringTransfer ), // frame image data + sizeof( QueueHeader ) + sizeof( QueueStringTransfer ), // external name + sizeof( QueueHeader ) + sizeof( QueueStringTransfer ), // external thread name +}; + +static_assert( QueueItemSize == 32, "Queue item size not 32 bytes" ); +static_assert( sizeof( QueueDataSize ) / sizeof( size_t ) == (uint8_t)QueueType::NUM_TYPES, "QueueDataSize mismatch" ); +static_assert( sizeof( void* ) <= sizeof( uint64_t ), "Pointer size > 8 bytes" ); +static_assert( sizeof( void* ) == sizeof( uintptr_t ), "Pointer size != uintptr_t" ); + +}; + +#endif diff --git a/libs/tracy/common/TracySocket.cpp b/libs/tracy/common/TracySocket.cpp @@ -0,0 +1,561 @@ +#include <assert.h> +#include <new> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <sys/types.h> + +#include "TracyAlloc.hpp" +#include "TracySocket.hpp" + +#ifdef _WIN32 +# ifndef NOMINMAX +# define NOMINMAX +# endif +# include <winsock2.h> +# include <ws2tcpip.h> +# ifdef _MSC_VER +# pragma warning(disable:4244) +# pragma warning(disable:4267) +# endif +# define poll WSAPoll +#else +# include <arpa/inet.h> +# include <sys/socket.h> +# include <sys/param.h> +# include <netinet/in.h> +# include <netdb.h> +# include <unistd.h> +# include <poll.h> +#endif + +#ifndef MSG_NOSIGNAL +# define MSG_NOSIGNAL 0 +#endif + +namespace tracy +{ + +#ifdef _WIN32 +typedef SOCKET socket_t; +#else +typedef int socket_t; +#endif + +#ifdef _WIN32 +struct __wsinit +{ + __wsinit() + { + WSADATA wsaData; + if( WSAStartup( MAKEWORD( 2, 2 ), &wsaData ) != 0 ) + { + fprintf( stderr, "Cannot init winsock.\n" ); + exit( 1 ); + } + } +}; + +void InitWinSock() +{ + static __wsinit init; +} +#endif + +Socket::Socket() + : m_buf( (char*)tracy_malloc( BufSize ) ) + , m_bufPtr( nullptr ) + , m_sock( -1 ) + , m_bufLeft( 0 ) +{ +#ifdef _WIN32 + InitWinSock(); +#endif +} + +Socket::Socket( int sock ) + : m_buf( (char*)tracy_malloc( BufSize ) ) + , m_bufPtr( nullptr ) + , m_sock( sock ) + , m_bufLeft( 0 ) +{ +} + +Socket::~Socket() +{ + tracy_free( m_buf ); + if( m_sock != -1 ) + { + Close(); + } +} + +bool Socket::Connect( const char* addr, int port ) +{ + assert( m_sock == -1 ); + + struct addrinfo hints; + struct addrinfo *res, *ptr; + + memset( &hints, 0, sizeof( hints ) ); + hints.ai_family = AF_UNSPEC; + hints.ai_socktype = SOCK_STREAM; + + char portbuf[32]; + sprintf( portbuf, "%i", port ); + + if( getaddrinfo( addr, portbuf, &hints, &res ) != 0 ) return false; + int sock = 0; + for( ptr = res; ptr; ptr = ptr->ai_next ) + { + if( ( sock = socket( ptr->ai_family, ptr->ai_socktype, ptr->ai_protocol ) ) == -1 ) continue; +#if defined __APPLE__ + int val = 1; + setsockopt( sock, SOL_SOCKET, SO_NOSIGPIPE, &val, sizeof( val ) ); +#endif + if( connect( sock, ptr->ai_addr, 
ptr->ai_addrlen ) == -1 ) + { +#ifdef _WIN32 + closesocket( sock ); +#else + close( sock ); +#endif + continue; + } + break; + } + freeaddrinfo( res ); + if( !ptr ) return false; + + m_sock = sock; + return true; +} + +void Socket::Close() +{ + assert( m_sock != -1 ); +#ifdef _WIN32 + closesocket( m_sock ); +#else + close( m_sock ); +#endif + m_sock = -1; +} + +int Socket::Send( const void* _buf, int len ) +{ + auto buf = (const char*)_buf; + assert( m_sock != -1 ); + auto start = buf; + while( len > 0 ) + { + auto ret = send( m_sock, buf, len, MSG_NOSIGNAL ); + if( ret == -1 ) return -1; + len -= ret; + buf += ret; + } + return int( buf - start ); +} + +int Socket::GetSendBufSize() +{ + int bufSize; +#if defined _WIN32 || defined __CYGWIN__ + int sz = sizeof( bufSize ); + getsockopt( m_sock, SOL_SOCKET, SO_SNDBUF, (char*)&bufSize, &sz ); +#else + socklen_t sz = sizeof( bufSize ); + getsockopt( m_sock, SOL_SOCKET, SO_SNDBUF, &bufSize, &sz ); +#endif + return bufSize; +} + +int Socket::RecvBuffered( void* buf, int len, int timeout ) +{ + if( len <= m_bufLeft ) + { + memcpy( buf, m_bufPtr, len ); + m_bufPtr += len; + m_bufLeft -= len; + return len; + } + + if( m_bufLeft > 0 ) + { + memcpy( buf, m_bufPtr, m_bufLeft ); + const auto ret = m_bufLeft; + m_bufLeft = 0; + return ret; + } + + if( len >= BufSize ) return Recv( buf, len, timeout ); + + m_bufLeft = Recv( m_buf, BufSize, timeout ); + if( m_bufLeft <= 0 ) return m_bufLeft; + + const auto sz = len < m_bufLeft ? len : m_bufLeft; + memcpy( buf, m_buf, sz ); + m_bufPtr = m_buf + sz; + m_bufLeft -= sz; + return sz; +} + +int Socket::Recv( void* _buf, int len, int timeout ) +{ + auto buf = (char*)_buf; + + struct pollfd fd; + fd.fd = (socket_t)m_sock; + fd.events = POLLIN; + + if( poll( &fd, 1, timeout ) > 0 ) + { + return recv( m_sock, buf, len, 0 ); + } + else + { + return -1; + } +} + +bool Socket::Read( void* _buf, int len, int timeout, std::function<bool()> exitCb ) +{ + auto buf = (char*)_buf; + + while( len > 0 ) + { + if( exitCb() ) return false; + const auto sz = RecvBuffered( buf, len, timeout ); + switch( sz ) + { + case 0: + return false; + case -1: +#ifdef _WIN32 + { + auto err = WSAGetLastError(); + if( err == WSAECONNABORTED || err == WSAECONNRESET ) return false; + } +#endif + break; + default: + len -= sz; + buf += sz; + break; + } + } + + return true; +} + +bool Socket::ReadRaw( void* _buf, int len, int timeout ) +{ + auto buf = (char*)_buf; + while( len > 0 ) + { + const auto sz = Recv( buf, len, timeout ); + if( sz <= 0 ) return false; + len -= sz; + buf += sz; + } + return true; +} + +bool Socket::HasData() +{ + if( m_bufLeft > 0 ) return true; + + struct pollfd fd; + fd.fd = (socket_t)m_sock; + fd.events = POLLIN; + + return poll( &fd, 1, 0 ) > 0; +} + + +ListenSocket::ListenSocket() + : m_sock( -1 ) +{ +#ifdef _WIN32 + InitWinSock(); +#endif +} + +ListenSocket::~ListenSocket() +{ + if( m_sock != -1 ) Close(); +} + +bool ListenSocket::Listen( int port, int backlog ) +{ + assert( m_sock == -1 ); + + struct addrinfo* res; + struct addrinfo hints; + + memset( &hints, 0, sizeof( hints ) ); + hints.ai_family = AF_INET6; + hints.ai_socktype = SOCK_STREAM; + hints.ai_flags = AI_PASSIVE; + + char portbuf[32]; + sprintf( portbuf, "%i", port ); + + if( getaddrinfo( nullptr, portbuf, &hints, &res ) != 0 ) return false; + + m_sock = socket( res->ai_family, res->ai_socktype, res->ai_protocol ); +#if defined _WIN32 || defined __CYGWIN__ + unsigned long val = 0; + setsockopt( m_sock, IPPROTO_IPV6, IPV6_V6ONLY, (const char*)&val, sizeof( 
val ) ); +#elif defined BSD + int val = 0; + setsockopt( m_sock, IPPROTO_IPV6, IPV6_V6ONLY, (const char*)&val, sizeof( val ) ); + val = 1; + setsockopt( m_sock, SOL_SOCKET, SO_REUSEADDR, &val, sizeof( val ) ); +#else + int val = 1; + setsockopt( m_sock, SOL_SOCKET, SO_REUSEADDR, &val, sizeof( val ) ); +#endif + if( bind( m_sock, res->ai_addr, res->ai_addrlen ) == -1 ) { freeaddrinfo( res ); return false; } + if( listen( m_sock, backlog ) == -1 ) { freeaddrinfo( res ); return false; } + freeaddrinfo( res ); + return true; +} + +Socket* ListenSocket::Accept() +{ + struct sockaddr_storage remote; + socklen_t sz = sizeof( remote ); + + struct pollfd fd; + fd.fd = (socket_t)m_sock; + fd.events = POLLIN; + + if( poll( &fd, 1, 10 ) > 0 ) + { + int sock = accept( m_sock, (sockaddr*)&remote, &sz); + if( sock == -1 ) return nullptr; + +#if defined __APPLE__ + int val = 1; + setsockopt( sock, SOL_SOCKET, SO_NOSIGPIPE, &val, sizeof( val ) ); +#endif + + auto ptr = (Socket*)tracy_malloc( sizeof( Socket ) ); + new(ptr) Socket( sock ); + return ptr; + } + else + { + return nullptr; + } +} + +void ListenSocket::Close() +{ + assert( m_sock != -1 ); +#ifdef _WIN32 + closesocket( m_sock ); +#else + close( m_sock ); +#endif + m_sock = -1; +} + +UdpBroadcast::UdpBroadcast() + : m_sock( -1 ) +{ +#ifdef _WIN32 + InitWinSock(); +#endif +} + +UdpBroadcast::~UdpBroadcast() +{ + if( m_sock != -1 ) Close(); +} + +bool UdpBroadcast::Open( const char* addr, int port ) +{ + assert( m_sock == -1 ); + + struct addrinfo hints; + struct addrinfo *res, *ptr; + + memset( &hints, 0, sizeof( hints ) ); + hints.ai_family = AF_INET; + hints.ai_socktype = SOCK_DGRAM; + + char portbuf[32]; + sprintf( portbuf, "%i", port ); + + if( getaddrinfo( addr, portbuf, &hints, &res ) != 0 ) return false; + int sock = 0; + for( ptr = res; ptr; ptr = ptr->ai_next ) + { + if( ( sock = socket( ptr->ai_family, ptr->ai_socktype, ptr->ai_protocol ) ) == -1 ) continue; +#if defined __APPLE__ + int val = 1; + setsockopt( sock, SOL_SOCKET, SO_NOSIGPIPE, &val, sizeof( val ) ); +#endif +#if defined _WIN32 || defined __CYGWIN__ + unsigned long broadcast = 1; + if( setsockopt( sock, SOL_SOCKET, SO_BROADCAST, (const char*)&broadcast, sizeof( broadcast ) ) == -1 ) +#else + int broadcast = 1; + if( setsockopt( sock, SOL_SOCKET, SO_BROADCAST, &broadcast, sizeof( broadcast ) ) == -1 ) +#endif + { +#ifdef _WIN32 + closesocket( sock ); +#else + close( sock ); +#endif + continue; + } + break; + } + freeaddrinfo( res ); + if( !ptr ) return false; + + m_sock = sock; + return true; +} + +void UdpBroadcast::Close() +{ + assert( m_sock != -1 ); +#ifdef _WIN32 + closesocket( m_sock ); +#else + close( m_sock ); +#endif + m_sock = -1; +} + +int UdpBroadcast::Send( int port, const void* data, int len ) +{ + assert( m_sock != -1 ); + struct sockaddr_in addr; + addr.sin_family = AF_INET; + addr.sin_port = htons( port ); + addr.sin_addr.s_addr = INADDR_BROADCAST; + return sendto( m_sock, (const char*)data, len, MSG_NOSIGNAL, (sockaddr*)&addr, sizeof( addr ) ); +} + +IpAddress::IpAddress() + : m_number( 0 ) +{ + *m_text = '\0'; +} + +IpAddress::~IpAddress() +{ +} + +void IpAddress::Set( const struct sockaddr& addr ) +{ +#if __MINGW32__ + auto ai = (struct sockaddr_in*)&addr; +#else + auto ai = (const struct sockaddr_in*)&addr; +#endif + inet_ntop( AF_INET, &ai->sin_addr, m_text, 17 ); + m_number = ai->sin_addr.s_addr; +} + +UdpListen::UdpListen() + : m_sock( -1 ) +{ +#ifdef _WIN32 + InitWinSock(); +#endif +} + +UdpListen::~UdpListen() +{ + if( m_sock != -1 ) Close(); +} + 
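As an aside, a minimal sketch (not part of the patch) of how the two UDP classes in this file are meant to pair up for discovery: one side announces itself with UdpBroadcast while the other waits with UdpListen and records the sender's address. The port number, message contents, and include path here are assumptions.

    #include <stdio.h>
    #include "TracySocket.hpp"  // assumed include path for the classes in this file

    // Hedged sketch: send a broadcast announcement and receive it elsewhere.
    static void SketchDiscovery()
    {
        tracy::UdpBroadcast broadcast;
        if( broadcast.Open( "255.255.255.255", 8086 ) )  // port is an assumption
        {
            const char msg[] = "hello";
            broadcast.Send( 8086, msg, sizeof( msg ) );
        }

        tracy::UdpListen listener;
        if( listener.Listen( 8086 ) )
        {
            size_t len;
            tracy::IpAddress addr;
            // Read() returns nullptr if nothing arrived within its 10 ms poll window.
            if( listener.Read( len, addr ) != nullptr )
            {
                printf( "announcement: %zu bytes from %s\n", len, addr.GetText() );
            }
        }
    }
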
+bool UdpListen::Listen( int port ) +{ + assert( m_sock == -1 ); + + int sock; + if( ( sock = socket( AF_INET, SOCK_DGRAM, 0 ) ) == -1 ) return false; + +#if defined __APPLE__ + int val = 1; + setsockopt( sock, SOL_SOCKET, SO_NOSIGPIPE, &val, sizeof( val ) ); +#endif +#if defined _WIN32 || defined __CYGWIN__ + unsigned long reuse = 1; + setsockopt( m_sock, SOL_SOCKET, SO_REUSEADDR, (const char*)&reuse, sizeof( reuse ) ); +#else + int reuse = 1; + setsockopt( m_sock, SOL_SOCKET, SO_REUSEADDR, &reuse, sizeof( reuse ) ); +#endif +#if defined _WIN32 || defined __CYGWIN__ + unsigned long broadcast = 1; + if( setsockopt( sock, SOL_SOCKET, SO_BROADCAST, (const char*)&broadcast, sizeof( broadcast ) ) == -1 ) +#else + int broadcast = 1; + if( setsockopt( sock, SOL_SOCKET, SO_BROADCAST, &broadcast, sizeof( broadcast ) ) == -1 ) +#endif + { +#ifdef _WIN32 + closesocket( sock ); +#else + close( sock ); +#endif + return false; + } + + struct sockaddr_in addr; + addr.sin_family = AF_INET; + addr.sin_port = htons( port ); + addr.sin_addr.s_addr = INADDR_ANY; + + if( bind( sock, (sockaddr*)&addr, sizeof( addr ) ) == -1 ) + { +#ifdef _WIN32 + closesocket( sock ); +#else + close( sock ); +#endif + return false; + } + + m_sock = sock; + return true; +} + +void UdpListen::Close() +{ + assert( m_sock != -1 ); +#ifdef _WIN32 + closesocket( m_sock ); +#else + close( m_sock ); +#endif + m_sock = -1; +} + +const char* UdpListen::Read( size_t& len, IpAddress& addr ) +{ + static char buf[2048]; + + struct pollfd fd; + fd.fd = (socket_t)m_sock; + fd.events = POLLIN; + if( poll( &fd, 1, 10 ) <= 0 ) return nullptr; + + sockaddr sa; + socklen_t salen = sizeof( struct sockaddr ); + len = (size_t)recvfrom( m_sock, buf, 2048, 0, &sa, &salen ); + addr.Set( sa ); + + return buf; +} + +} diff --git a/libs/tracy/common/TracySocket.hpp b/libs/tracy/common/TracySocket.hpp @@ -0,0 +1,131 @@ +#ifndef __TRACYSOCKET_HPP__ +#define __TRACYSOCKET_HPP__ + +#include <functional> + +struct sockaddr; + +namespace tracy +{ + +#ifdef _WIN32 +void InitWinSock(); +#endif + +class Socket +{ + enum { BufSize = 128 * 1024 }; + +public: + Socket(); + Socket( int sock ); + ~Socket(); + + bool Connect( const char* addr, int port ); + void Close(); + + int Send( const void* buf, int len ); + int GetSendBufSize(); + + bool Read( void* buf, int len, int timeout, std::function<bool()> exitCb ); + bool ReadRaw( void* buf, int len, int timeout ); + bool HasData(); + + Socket( const Socket& ) = delete; + Socket( Socket&& ) = delete; + Socket& operator=( const Socket& ) = delete; + Socket& operator=( Socket&& ) = delete; + +private: + int RecvBuffered( void* buf, int len, int timeout ); + int Recv( void* buf, int len, int timeout ); + + char* m_buf; + char* m_bufPtr; + int m_sock; + int m_bufLeft; +}; + +class ListenSocket +{ +public: + ListenSocket(); + ~ListenSocket(); + + bool Listen( int port, int backlog ); + Socket* Accept(); + void Close(); + + ListenSocket( const ListenSocket& ) = delete; + ListenSocket( ListenSocket&& ) = delete; + ListenSocket& operator=( const ListenSocket& ) = delete; + ListenSocket& operator=( ListenSocket&& ) = delete; + +private: + int m_sock; +}; + +class UdpBroadcast +{ +public: + UdpBroadcast(); + ~UdpBroadcast(); + + bool Open( const char* addr, int port ); + void Close(); + + int Send( int port, const void* data, int len ); + + UdpBroadcast( const UdpBroadcast& ) = delete; + UdpBroadcast( UdpBroadcast&& ) = delete; + UdpBroadcast& operator=( const UdpBroadcast& ) = delete; + UdpBroadcast& operator=( UdpBroadcast&& ) = 
delete; + +private: + int m_sock; +}; + +class IpAddress +{ +public: + IpAddress(); + ~IpAddress(); + + void Set( const struct sockaddr& addr ); + + uint32_t GetNumber() const { return m_number; } + const char* GetText() const { return m_text; } + + IpAddress( const IpAddress& ) = delete; + IpAddress( IpAddress&& ) = delete; + IpAddress& operator=( const IpAddress& ) = delete; + IpAddress& operator=( IpAddress&& ) = delete; + +private: + uint32_t m_number; + char m_text[17]; +}; + +class UdpListen +{ +public: + UdpListen(); + ~UdpListen(); + + bool Listen( int port ); + void Close(); + + const char* Read( size_t& len, IpAddress& addr ); + + UdpListen( const UdpListen& ) = delete; + UdpListen( UdpListen&& ) = delete; + UdpListen& operator=( const UdpListen& ) = delete; + UdpListen& operator=( UdpListen&& ) = delete; + +private: + int m_sock; +}; + +} + +#endif diff --git a/libs/tracy/common/TracySystem.cpp b/libs/tracy/common/TracySystem.cpp @@ -0,0 +1,187 @@ +#if defined _MSC_VER || defined __CYGWIN__ || defined _WIN32 +# ifndef WIN32_LEAN_AND_MEAN +# define WIN32_LEAN_AND_MEAN +# endif +# ifndef NOMINMAX +# define NOMINMAX +# endif +#endif +#if defined _WIN32 || defined __CYGWIN__ +# include <windows.h> +#else +# include <pthread.h> +# include <string.h> +# include <unistd.h> +#endif + +#ifdef __linux__ +# ifndef __ANDROID__ +# include <syscall.h> +# endif +# include <fcntl.h> +#endif + +#ifdef __MINGW32__ +# define __STDC_FORMAT_MACROS +#endif +#include <inttypes.h> +#include <stdio.h> +#include <stdlib.h> + +#include "TracySystem.hpp" + +#ifdef TRACY_ENABLE +# include <atomic> +# include "TracyAlloc.hpp" +#endif + +namespace tracy +{ + +#ifdef TRACY_ENABLE +struct ThreadNameData +{ + uint64_t id; + const char* name; + ThreadNameData* next; +}; +TRACY_API std::atomic<ThreadNameData*>& GetThreadNameData(); +TRACY_API void InitRPMallocThread(); +#endif + +void SetThreadName( const char* name ) +{ +#if defined _WIN32 || defined __CYGWIN__ +# if defined NTDDI_WIN10_RS2 && NTDDI_VERSION >= NTDDI_WIN10_RS2 + wchar_t buf[256]; + mbstowcs( buf, name, 256 ); + SetThreadDescription( GetCurrentThread(), buf ); +# elif defined _MSC_VER + const DWORD MS_VC_EXCEPTION=0x406D1388; +# pragma pack( push, 8 ) + struct THREADNAME_INFO + { + DWORD dwType; + LPCSTR szName; + DWORD dwThreadID; + DWORD dwFlags; + }; +# pragma pack(pop) + + DWORD ThreadId = GetCurrentThreadId(); + THREADNAME_INFO info; + info.dwType = 0x1000; + info.szName = name; + info.dwThreadID = ThreadId; + info.dwFlags = 0; + + __try + { + RaiseException( MS_VC_EXCEPTION, 0, sizeof(info)/sizeof(ULONG_PTR), (ULONG_PTR*)&info ); + } + __except(EXCEPTION_EXECUTE_HANDLER) + { + } +# endif +#elif defined _GNU_SOURCE && !defined __EMSCRIPTEN__ && !defined __CYGWIN__ + { + const auto sz = strlen( name ); + if( sz <= 15 ) + { + pthread_setname_np( pthread_self(), name ); + } + else + { + char buf[16]; + memcpy( buf, name, 15 ); + buf[15] = '\0'; + pthread_setname_np( pthread_self(), buf ); + } + } +#endif +#ifdef TRACY_ENABLE + { + InitRPMallocThread(); + const auto sz = strlen( name ); + char* buf = (char*)tracy_malloc( sz+1 ); + memcpy( buf, name, sz ); + buf[sz+1] = '\0'; + auto data = (ThreadNameData*)tracy_malloc( sizeof( ThreadNameData ) ); + data->id = detail::GetThreadHandleImpl(); + data->name = buf; + data->next = GetThreadNameData().load( std::memory_order_relaxed ); + while( !GetThreadNameData().compare_exchange_weak( data->next, data, std::memory_order_release, std::memory_order_relaxed ) ) {} + } +#endif +} + +const char* 
GetThreadName( uint64_t id ) +{ + static char buf[256]; +#ifdef TRACY_ENABLE + auto ptr = GetThreadNameData().load( std::memory_order_relaxed ); + while( ptr ) + { + if( ptr->id == id ) + { + return ptr->name; + } + ptr = ptr->next; + } +#else +# if defined _WIN32 || defined __CYGWIN__ +# if defined NTDDI_WIN10_RS2 && NTDDI_VERSION >= NTDDI_WIN10_RS2 + auto hnd = OpenThread( THREAD_QUERY_LIMITED_INFORMATION, FALSE, (DWORD)id ); + if( hnd != 0 ) + { + PWSTR tmp; + GetThreadDescription( hnd, &tmp ); + auto ret = wcstombs( buf, tmp, 256 ); + CloseHandle( hnd ); + if( ret != 0 ) + { + return buf; + } + } +# endif +# elif defined __GLIBC__ && !defined __ANDROID__ && !defined __EMSCRIPTEN__ && !defined __CYGWIN__ + if( pthread_getname_np( (pthread_t)id, buf, 256 ) == 0 ) + { + return buf; + } +# elif defined __linux__ + int cs, fd; + char path[32]; +# ifdef __ANDROID__ + int tid = gettid(); +# else + int tid = (int) syscall( SYS_gettid ); +# endif + snprintf( path, sizeof( path ), "/proc/self/task/%d/comm", tid ); + sprintf( buf, "%" PRIu64, id ); +# ifndef __ANDROID__ + pthread_setcancelstate( PTHREAD_CANCEL_DISABLE, &cs ); +# endif + if ( ( fd = open( path, O_RDONLY ) ) > 0) { + int len = read( fd, buf, 255 ); + if( len > 0 ) + { + buf[len] = 0; + if( len > 1 && buf[len-1] == '\n' ) + { + buf[len-1] = 0; + } + } + close( fd ); + } +# ifndef __ANDROID__ + pthread_setcancelstate( cs, 0 ); +# endif + return buf; +# endif +#endif + sprintf( buf, "%" PRIu64, id ); + return buf; +} + +} diff --git a/libs/tracy/common/TracySystem.hpp b/libs/tracy/common/TracySystem.hpp @@ -0,0 +1,80 @@ +#ifndef __TRACYSYSTEM_HPP__ +#define __TRACYSYSTEM_HPP__ + +#if defined _WIN32 || defined __CYGWIN__ +# ifndef _WINDOWS_ +extern "C" __declspec(dllimport) unsigned long __stdcall GetCurrentThreadId(void); +# endif +#elif defined __APPLE__ || ( !defined __ANDROID__ && !defined __linux__ ) +# include <pthread.h> +#endif + +#ifdef __linux__ +# include <unistd.h> +# ifdef __ANDROID__ +# include <sys/types.h> +# else +# include <sys/syscall.h> +# endif +#elif defined __FreeBSD__ +# include <sys/thr.h> +#elif defined __NetBSD__ || defined __DragonFly__ +# include <sys/lwp.h> +#elif defined __OpenBSD__ +# include <unistd.h> +#endif + +#include <stdint.h> + +#include "TracyApi.h" + +namespace tracy +{ + +namespace detail +{ +static inline uint64_t GetThreadHandleImpl() +{ +#if defined _WIN32 || defined __CYGWIN__ + static_assert( sizeof( decltype( GetCurrentThreadId() ) ) <= sizeof( uint64_t ), "Thread handle too big to fit in protocol" ); + return uint64_t( GetCurrentThreadId() ); +#elif defined __APPLE__ + uint64_t id; + pthread_threadid_np( pthread_self(), &id ); + return id; +#elif defined __ANDROID__ + return (uint64_t)gettid(); +#elif defined __linux__ + return (uint64_t)syscall( SYS_gettid ); +#elif defined __FreeBSD__ + long id; + thr_self( &id ); + return id; +#elif defined __NetBSD__ + return _lwp_self(); +#elif defined __DragonFly__ + return lwp_gettid(); +#elif defined __OpenBSD__ + return getthrid(); +#else + static_assert( sizeof( decltype( pthread_self() ) ) <= sizeof( uint64_t ), "Thread handle too big to fit in protocol" ); + return uint64_t( pthread_self() ); +#endif +} +} + +#ifdef TRACY_ENABLE +TRACY_API uint64_t GetThreadHandle(); +#else +static inline uint64_t GetThreadHandle() +{ + return detail::GetThreadHandleImpl(); +} +#endif + +void SetThreadName( const char* name ); +const char* GetThreadName( uint64_t id ); + +} + +#endif diff --git a/libs/tracy/common/tracy_benaphore.h 
b/libs/tracy/common/tracy_benaphore.h @@ -0,0 +1,68 @@ +// Copyright (c) 2015 Jeff Preshing +// +// This software is provided 'as-is', without any express or implied +// warranty. In no event will the authors be held liable for any damages +// arising from the use of this software. +// +// Permission is granted to anyone to use this software for any purpose, +// including commercial applications, and to alter it and redistribute it +// freely, subject to the following restrictions: +// +// 1. The origin of this software must not be misrepresented; you must not +// claim that you wrote the original software. If you use this software +// in a product, an acknowledgement in the product documentation would be +// appreciated but is not required. +// 2. Altered source versions must be plainly marked as such, and must not be +// misrepresented as being the original software. +// 3. This notice may not be removed or altered from any source distribution. + +#ifndef __TRACY_CPP11OM_BENAPHORE_H__ +#define __TRACY_CPP11OM_BENAPHORE_H__ + +#include <cassert> +#include <thread> +#include <atomic> +#include "tracy_sema.h" + +namespace tracy +{ + +class NonRecursiveBenaphore +{ +private: + std::atomic<int> m_contentionCount; + DefaultSemaphoreType m_sema; + +public: + NonRecursiveBenaphore() : m_contentionCount(0) {} + + void lock() + { + if (m_contentionCount.fetch_add(1, std::memory_order_acquire) > 0) + { + m_sema.wait(); + } + } + + bool try_lock() + { + if (m_contentionCount.load(std::memory_order_relaxed) != 0) + return false; + int expected = 0; + return m_contentionCount.compare_exchange_strong(expected, 1, std::memory_order_acquire); + } + + void unlock() + { + int oldCount = m_contentionCount.fetch_sub(1, std::memory_order_release); + assert(oldCount > 0); + if (oldCount > 1) + { + m_sema.signal(); + } + } +}; + +} + +#endif // __CPP11OM_BENAPHORE_H__ diff --git a/libs/tracy/common/tracy_lz4.cpp b/libs/tracy/common/tracy_lz4.cpp @@ -0,0 +1,2297 @@ +/* + LZ4 - Fast LZ compression algorithm + Copyright (C) 2011-present, Yann Collet. + + BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following disclaimer + in the documentation and/or other materials provided with the + distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ + You can contact the author at : + - LZ4 homepage : http://www.lz4.org + - LZ4 source repository : https://github.com/lz4/lz4 +*/ + +/*-************************************ +* Tuning parameters +**************************************/ +/* + * LZ4_HEAPMODE : + * Select how default compression functions will allocate memory for their hash table, + * in memory stack (0:default, fastest), or in memory heap (1:requires malloc()). + */ +#ifndef LZ4_HEAPMODE +# define LZ4_HEAPMODE 0 +#endif + +/* + * ACCELERATION_DEFAULT : + * Select "acceleration" for LZ4_compress_fast() when parameter value <= 0 + */ +#define ACCELERATION_DEFAULT 1 + + +/*-************************************ +* CPU Feature Detection +**************************************/ +/* LZ4_FORCE_MEMORY_ACCESS + * By default, access to unaligned memory is controlled by `memcpy()`, which is safe and portable. + * Unfortunately, on some target/compiler combinations, the generated assembly is sub-optimal. + * The below switch allow to select different access method for improved performance. + * Method 0 (default) : use `memcpy()`. Safe and portable. + * Method 1 : `__packed` statement. It depends on compiler extension (ie, not portable). + * This method is safe if your compiler supports it, and *generally* as fast or faster than `memcpy`. + * Method 2 : direct access. This method is portable but violate C standard. + * It can generate buggy code on targets which assembly generation depends on alignment. + * But in some circumstances, it's the only known way to get the most performance (ie GCC + ARMv6) + * See https://fastcompression.blogspot.fr/2015/08/accessing-unaligned-memory.html for details. + * Prefer these methods in priority order (0 > 1 > 2) + */ +#ifndef LZ4_FORCE_MEMORY_ACCESS /* can be defined externally */ +# if defined(__GNUC__) && \ + ( defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) || defined(__ARM_ARCH_6K__) \ + || defined(__ARM_ARCH_6Z__) || defined(__ARM_ARCH_6ZK__) || defined(__ARM_ARCH_6T2__) ) +# define LZ4_FORCE_MEMORY_ACCESS 2 +# elif (defined(__INTEL_COMPILER) && !defined(_WIN32)) || defined(__GNUC__) +# define LZ4_FORCE_MEMORY_ACCESS 1 +# endif +#endif + +/* + * LZ4_FORCE_SW_BITCOUNT + * Define this parameter if your target system or compiler does not support hardware bit count + */ +#if defined(_MSC_VER) && defined(_WIN32_WCE) /* Visual Studio for WinCE doesn't support Hardware bit count */ +# define LZ4_FORCE_SW_BITCOUNT +#endif + + + +/*-************************************ +* Dependency +**************************************/ +/* + * LZ4_SRC_INCLUDED: + * Amalgamation flag, whether lz4.c is included + */ +#ifndef LZ4_SRC_INCLUDED +# define LZ4_SRC_INCLUDED 1 +#endif + +#ifndef LZ4_STATIC_LINKING_ONLY +#define LZ4_STATIC_LINKING_ONLY +#endif + +#ifndef LZ4_DISABLE_DEPRECATE_WARNINGS +#define LZ4_DISABLE_DEPRECATE_WARNINGS /* due to LZ4_decompress_safe_withPrefix64k */ +#endif + +#include "tracy_lz4.hpp" +/* see also "memory routines" below */ + + +/*-************************************ +* Compiler Options +**************************************/ +#ifdef _MSC_VER /* Visual Studio */ +# include <intrin.h> +# pragma warning(disable : 4127) /* disable: C4127: conditional expression is constant */ +# pragma warning(disable : 4293) /* disable: C4293: too large shift (32-bits) */ +#endif /* _MSC_VER */ + +#ifndef LZ4_FORCE_INLINE +# ifdef _MSC_VER /* Visual Studio */ +# define LZ4_FORCE_INLINE static __forceinline +# else +# if defined (__cplusplus) || defined (__STDC_VERSION__) && __STDC_VERSION__ >= 
199901L /* C99 */ +# ifdef __GNUC__ +# define LZ4_FORCE_INLINE static inline __attribute__((always_inline)) +# else +# define LZ4_FORCE_INLINE static inline +# endif +# else +# define LZ4_FORCE_INLINE static +# endif /* __STDC_VERSION__ */ +# endif /* _MSC_VER */ +#endif /* LZ4_FORCE_INLINE */ + +/* LZ4_FORCE_O2_GCC_PPC64LE and LZ4_FORCE_O2_INLINE_GCC_PPC64LE + * gcc on ppc64le generates an unrolled SIMDized loop for LZ4_wildCopy8, + * together with a simple 8-byte copy loop as a fall-back path. + * However, this optimization hurts the decompression speed by >30%, + * because the execution does not go to the optimized loop + * for typical compressible data, and all of the preamble checks + * before going to the fall-back path become useless overhead. + * This optimization happens only with the -O3 flag, and -O2 generates + * a simple 8-byte copy loop. + * With gcc on ppc64le, all of the LZ4_decompress_* and LZ4_wildCopy8 + * functions are annotated with __attribute__((optimize("O2"))), + * and also LZ4_wildCopy8 is forcibly inlined, so that the O2 attribute + * of LZ4_wildCopy8 does not affect the compression speed. + */ +#if defined(__PPC64__) && defined(__LITTLE_ENDIAN__) && defined(__GNUC__) && !defined(__clang__) +# define LZ4_FORCE_O2_GCC_PPC64LE __attribute__((optimize("O2"))) +# define LZ4_FORCE_O2_INLINE_GCC_PPC64LE __attribute__((optimize("O2"))) LZ4_FORCE_INLINE +#else +# define LZ4_FORCE_O2_GCC_PPC64LE +# define LZ4_FORCE_O2_INLINE_GCC_PPC64LE static +#endif + +#if (defined(__GNUC__) && (__GNUC__ >= 3)) || (defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 800)) || defined(__clang__) +# define expect(expr,value) (__builtin_expect ((expr),(value)) ) +#else +# define expect(expr,value) (expr) +#endif + +#ifndef likely +#define likely(expr) expect((expr) != 0, 1) +#endif +#ifndef unlikely +#define unlikely(expr) expect((expr) != 0, 0) +#endif + + +/*-************************************ +* Memory routines +**************************************/ +#include <stdlib.h> /* malloc, calloc, free */ +#define ALLOC(s) malloc(s) +#define ALLOC_AND_ZERO(s) calloc(1,s) +#define FREEMEM(p) free(p) +#include <string.h> /* memset, memcpy */ +#define MEM_INIT(p,v,s) memset((p),(v),(s)) + + +/*-************************************ +* Types +**************************************/ +#if defined(__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) +# include <stdint.h> + typedef uint8_t BYTE; + typedef uint16_t U16; + typedef uint32_t U32; + typedef int32_t S32; + typedef uint64_t U64; + typedef uintptr_t uptrval; +#else + typedef unsigned char BYTE; + typedef unsigned short U16; + typedef unsigned int U32; + typedef signed int S32; + typedef unsigned long long U64; + typedef size_t uptrval; /* generally true, except OpenVMS-64 */ +#endif + +#if defined(__x86_64__) + typedef U64 reg_t; /* 64-bits in x32 mode */ +#else + typedef size_t reg_t; /* 32-bits in x32 mode */ +#endif + +namespace tracy +{ + +typedef enum { + notLimited = 0, + limitedOutput = 1, + fillOutput = 2 +} limitedOutput_directive; + + +/*-************************************ +* Reading and writing into memory +**************************************/ +static unsigned LZ4_isLittleEndian(void) +{ + const union { U32 u; BYTE c[4]; } one = { 1 }; /* don't use static : performance detrimental */ + return one.c[0]; +} + + +#if defined(LZ4_FORCE_MEMORY_ACCESS) && (LZ4_FORCE_MEMORY_ACCESS==2) +/* lie to the compiler about data alignment; use with caution */ + +static U16 LZ4_read16(const void* memPtr) { 
return *(const U16*) memPtr; } +static U32 LZ4_read32(const void* memPtr) { return *(const U32*) memPtr; } +static reg_t LZ4_read_ARCH(const void* memPtr) { return *(const reg_t*) memPtr; } + +static void LZ4_write16(void* memPtr, U16 value) { *(U16*)memPtr = value; } +static void LZ4_write32(void* memPtr, U32 value) { *(U32*)memPtr = value; } + +#elif defined(LZ4_FORCE_MEMORY_ACCESS) && (LZ4_FORCE_MEMORY_ACCESS==1) + +/* __pack instructions are safer, but compiler specific, hence potentially problematic for some compilers */ +/* currently only defined for gcc and icc */ +typedef union { U16 u16; U32 u32; reg_t uArch; } __attribute__((packed)) unalign; + +static U16 LZ4_read16(const void* ptr) { return ((const unalign*)ptr)->u16; } +static U32 LZ4_read32(const void* ptr) { return ((const unalign*)ptr)->u32; } +static reg_t LZ4_read_ARCH(const void* ptr) { return ((const unalign*)ptr)->uArch; } + +static void LZ4_write16(void* memPtr, U16 value) { ((unalign*)memPtr)->u16 = value; } +static void LZ4_write32(void* memPtr, U32 value) { ((unalign*)memPtr)->u32 = value; } + +#else /* safe and portable access using memcpy() */ + +static U16 LZ4_read16(const void* memPtr) +{ + U16 val; memcpy(&val, memPtr, sizeof(val)); return val; +} + +static U32 LZ4_read32(const void* memPtr) +{ + U32 val; memcpy(&val, memPtr, sizeof(val)); return val; +} + +static reg_t LZ4_read_ARCH(const void* memPtr) +{ + reg_t val; memcpy(&val, memPtr, sizeof(val)); return val; +} + +static void LZ4_write16(void* memPtr, U16 value) +{ + memcpy(memPtr, &value, sizeof(value)); +} + +static void LZ4_write32(void* memPtr, U32 value) +{ + memcpy(memPtr, &value, sizeof(value)); +} + +#endif /* LZ4_FORCE_MEMORY_ACCESS */ + + +static U16 LZ4_readLE16(const void* memPtr) +{ + if (LZ4_isLittleEndian()) { + return LZ4_read16(memPtr); + } else { + const BYTE* p = (const BYTE*)memPtr; + return (U16)((U16)p[0] + (p[1]<<8)); + } +} + +static void LZ4_writeLE16(void* memPtr, U16 value) +{ + if (LZ4_isLittleEndian()) { + LZ4_write16(memPtr, value); + } else { + BYTE* p = (BYTE*)memPtr; + p[0] = (BYTE) value; + p[1] = (BYTE)(value>>8); + } +} + +/* customized variant of memcpy, which can overwrite up to 8 bytes beyond dstEnd */ +LZ4_FORCE_O2_INLINE_GCC_PPC64LE +void LZ4_wildCopy8(void* dstPtr, const void* srcPtr, void* dstEnd) +{ + BYTE* d = (BYTE*)dstPtr; + const BYTE* s = (const BYTE*)srcPtr; + BYTE* const e = (BYTE*)dstEnd; + + do { memcpy(d,s,8); d+=8; s+=8; } while (d<e); +} + +static const unsigned inc32table[8] = {0, 1, 2, 1, 0, 4, 4, 4}; +static const int dec64table[8] = {0, 0, 0, -1, -4, 1, 2, 3}; + + +#ifndef LZ4_FAST_DEC_LOOP +# if defined __i386 || defined _M_IX86 || defined __x86_64__ || defined _M_X64 +# define LZ4_FAST_DEC_LOOP 1 +# else +# define LZ4_FAST_DEC_LOOP 0 +# endif +#endif + +#if LZ4_FAST_DEC_LOOP + +LZ4_FORCE_O2_INLINE_GCC_PPC64LE void +LZ4_memcpy_using_offset_base(BYTE* dstPtr, const BYTE* srcPtr, BYTE* dstEnd, const size_t offset) +{ + if (offset < 8) { + dstPtr[0] = srcPtr[0]; + dstPtr[1] = srcPtr[1]; + dstPtr[2] = srcPtr[2]; + dstPtr[3] = srcPtr[3]; + srcPtr += inc32table[offset]; + memcpy(dstPtr+4, srcPtr, 4); + srcPtr -= dec64table[offset]; + dstPtr += 8; + } else { + memcpy(dstPtr, srcPtr, 8); + dstPtr += 8; + srcPtr += 8; + } + + LZ4_wildCopy8(dstPtr, srcPtr, dstEnd); +} + +/* customized variant of memcpy, which can overwrite up to 32 bytes beyond dstEnd + * this version copies two times 16 bytes (instead of one time 32 bytes) + * because it must be compatible with offsets >= 16. 
*/ +LZ4_FORCE_O2_INLINE_GCC_PPC64LE void +LZ4_wildCopy32(void* dstPtr, const void* srcPtr, void* dstEnd) +{ + BYTE* d = (BYTE*)dstPtr; + const BYTE* s = (const BYTE*)srcPtr; + BYTE* const e = (BYTE*)dstEnd; + + do { memcpy(d,s,16); memcpy(d+16,s+16,16); d+=32; s+=32; } while (d<e); +} + +LZ4_FORCE_O2_INLINE_GCC_PPC64LE void +LZ4_memcpy_using_offset(BYTE* dstPtr, const BYTE* srcPtr, BYTE* dstEnd, const size_t offset) +{ + BYTE v[8]; + switch(offset) { + case 1: + memset(v, *srcPtr, 8); + goto copy_loop; + case 2: + memcpy(v, srcPtr, 2); + memcpy(&v[2], srcPtr, 2); + memcpy(&v[4], &v[0], 4); + goto copy_loop; + case 4: + memcpy(v, srcPtr, 4); + memcpy(&v[4], srcPtr, 4); + goto copy_loop; + default: + LZ4_memcpy_using_offset_base(dstPtr, srcPtr, dstEnd, offset); + return; + } + + copy_loop: + memcpy(dstPtr, v, 8); + dstPtr += 8; + while (dstPtr < dstEnd) { + memcpy(dstPtr, v, 8); + dstPtr += 8; + } +} +#endif + + +/*-************************************ +* Common Constants +**************************************/ +#define MINMATCH 4 + +#define WILDCOPYLENGTH 8 +#define LASTLITERALS 5 /* see ../doc/lz4_Block_format.md#parsing-restrictions */ +#define MFLIMIT 12 /* see ../doc/lz4_Block_format.md#parsing-restrictions */ +#define MATCH_SAFEGUARD_DISTANCE ((2*WILDCOPYLENGTH) - MINMATCH) /* ensure it's possible to write 2 x wildcopyLength without overflowing output buffer */ +#define FASTLOOP_SAFE_DISTANCE 64 +static const int LZ4_minLength = (MFLIMIT+1); + +#define KB *(1 <<10) +#define MB *(1 <<20) +#define GB *(1U<<30) + +#ifndef LZ4_DISTANCE_MAX /* can be user - defined at compile time */ +# define LZ4_DISTANCE_MAX 65535 +#endif + +#if (LZ4_DISTANCE_MAX > 65535) /* max supported by LZ4 format */ +# error "LZ4_DISTANCE_MAX is too big : must be <= 65535" +#endif + +#define ML_BITS 4 +#define ML_MASK ((1U<<ML_BITS)-1) +#define RUN_BITS (8-ML_BITS) +#define RUN_MASK ((1U<<RUN_BITS)-1) + + +/*-************************************ +* Error detection +**************************************/ +#if defined(LZ4_DEBUG) && (LZ4_DEBUG>=1) +# include <assert.h> +#else +# ifndef assert +# define assert(condition) ((void)0) +# endif +#endif + +#define LZ4_STATIC_ASSERT(c) { enum { LZ4_static_assert = 1/(int)(!!(c)) }; } /* use after variable declarations */ + +#if defined(LZ4_DEBUG) && (LZ4_DEBUG>=2) +# include <stdio.h> +static int g_debuglog_enable = 1; +# define DEBUGLOG(l, ...) { \ + if ((g_debuglog_enable) && (l<=LZ4_DEBUG)) { \ + fprintf(stderr, __FILE__ ": "); \ + fprintf(stderr, __VA_ARGS__); \ + fprintf(stderr, " \n"); \ + } } +#else +# define DEBUGLOG(l, ...) 
{} /* disabled */ +#endif + + +/*-************************************ +* Common functions +**************************************/ +static unsigned LZ4_NbCommonBytes (reg_t val) +{ + if (LZ4_isLittleEndian()) { + if (sizeof(val)==8) { +# if defined(_MSC_VER) && defined(_WIN64) && !defined(LZ4_FORCE_SW_BITCOUNT) + unsigned long r = 0; + _BitScanForward64( &r, (U64)val ); + return (int)(r>>3); +# elif (defined(__clang__) || (defined(__GNUC__) && (__GNUC__>=3))) && !defined(LZ4_FORCE_SW_BITCOUNT) + return (__builtin_ctzll((U64)val) >> 3); +# else + static const int DeBruijnBytePos[64] = { 0, 0, 0, 0, 0, 1, 1, 2, + 0, 3, 1, 3, 1, 4, 2, 7, + 0, 2, 3, 6, 1, 5, 3, 5, + 1, 3, 4, 4, 2, 5, 6, 7, + 7, 0, 1, 2, 3, 3, 4, 6, + 2, 6, 5, 5, 3, 4, 5, 6, + 7, 1, 2, 4, 6, 4, 4, 5, + 7, 2, 6, 5, 7, 6, 7, 7 }; + return DeBruijnBytePos[((U64)((val & -(long long)val) * 0x0218A392CDABBD3FULL)) >> 58]; +# endif + } else /* 32 bits */ { +# if defined(_MSC_VER) && !defined(LZ4_FORCE_SW_BITCOUNT) + unsigned long r; + _BitScanForward( &r, (U32)val ); + return (int)(r>>3); +# elif (defined(__clang__) || (defined(__GNUC__) && (__GNUC__>=3))) && !defined(LZ4_FORCE_SW_BITCOUNT) + return (__builtin_ctz((U32)val) >> 3); +# else + static const int DeBruijnBytePos[32] = { 0, 0, 3, 0, 3, 1, 3, 0, + 3, 2, 2, 1, 3, 2, 0, 1, + 3, 3, 1, 2, 2, 2, 2, 0, + 3, 1, 2, 0, 1, 0, 1, 1 }; + return DeBruijnBytePos[((U32)((val & -(S32)val) * 0x077CB531U)) >> 27]; +# endif + } + } else /* Big Endian CPU */ { + if (sizeof(val)==8) { /* 64-bits */ +# if defined(_MSC_VER) && defined(_WIN64) && !defined(LZ4_FORCE_SW_BITCOUNT) + unsigned long r = 0; + _BitScanReverse64( &r, val ); + return (unsigned)(r>>3); +# elif (defined(__clang__) || (defined(__GNUC__) && (__GNUC__>=3))) && !defined(LZ4_FORCE_SW_BITCOUNT) + return (__builtin_clzll((U64)val) >> 3); +# else + static const U32 by32 = sizeof(val)*4; /* 32 on 64 bits (goal), 16 on 32 bits. + Just to avoid some static analyzer complaining about shift by 32 on 32-bits target. + Note that this code path is never triggered in 32-bits mode. 
*/ + unsigned r; + if (!(val>>by32)) { r=4; } else { r=0; val>>=by32; } + if (!(val>>16)) { r+=2; val>>=8; } else { val>>=24; } + r += (!val); + return r; +# endif + } else /* 32 bits */ { +# if defined(_MSC_VER) && !defined(LZ4_FORCE_SW_BITCOUNT) + unsigned long r = 0; + _BitScanReverse( &r, (unsigned long)val ); + return (unsigned)(r>>3); +# elif (defined(__clang__) || (defined(__GNUC__) && (__GNUC__>=3))) && !defined(LZ4_FORCE_SW_BITCOUNT) + return (__builtin_clz((U32)val) >> 3); +# else + unsigned r; + if (!(val>>16)) { r=2; val>>=8; } else { r=0; val>>=24; } + r += (!val); + return r; +# endif + } + } +} + +#define STEPSIZE sizeof(reg_t) +LZ4_FORCE_INLINE +unsigned LZ4_count(const BYTE* pIn, const BYTE* pMatch, const BYTE* pInLimit) +{ + const BYTE* const pStart = pIn; + + if (likely(pIn < pInLimit-(STEPSIZE-1))) { + reg_t const diff = LZ4_read_ARCH(pMatch) ^ LZ4_read_ARCH(pIn); + if (!diff) { + pIn+=STEPSIZE; pMatch+=STEPSIZE; + } else { + return LZ4_NbCommonBytes(diff); + } } + + while (likely(pIn < pInLimit-(STEPSIZE-1))) { + reg_t const diff = LZ4_read_ARCH(pMatch) ^ LZ4_read_ARCH(pIn); + if (!diff) { pIn+=STEPSIZE; pMatch+=STEPSIZE; continue; } + pIn += LZ4_NbCommonBytes(diff); + return (unsigned)(pIn - pStart); + } + + if ((STEPSIZE==8) && (pIn<(pInLimit-3)) && (LZ4_read32(pMatch) == LZ4_read32(pIn))) { pIn+=4; pMatch+=4; } + if ((pIn<(pInLimit-1)) && (LZ4_read16(pMatch) == LZ4_read16(pIn))) { pIn+=2; pMatch+=2; } + if ((pIn<pInLimit) && (*pMatch == *pIn)) pIn++; + return (unsigned)(pIn - pStart); +} + + +#ifndef LZ4_COMMONDEFS_ONLY +/*-************************************ +* Local Constants +**************************************/ +static const int LZ4_64Klimit = ((64 KB) + (MFLIMIT-1)); +static const U32 LZ4_skipTrigger = 6; /* Increase this value ==> compression run slower on incompressible data */ + + +/*-************************************ +* Local Structures and types +**************************************/ +typedef enum { clearedTable = 0, byPtr, byU32, byU16 } tableType_t; + +/** + * This enum distinguishes several different modes of accessing previous + * content in the stream. + * + * - noDict : There is no preceding content. + * - withPrefix64k : Table entries up to ctx->dictSize before the current blob + * blob being compressed are valid and refer to the preceding + * content (of length ctx->dictSize), which is available + * contiguously preceding in memory the content currently + * being compressed. + * - usingExtDict : Like withPrefix64k, but the preceding content is somewhere + * else in memory, starting at ctx->dictionary with length + * ctx->dictSize. + * - usingDictCtx : Like usingExtDict, but everything concerning the preceding + * content is in a separate context, pointed to by + * ctx->dictCtx. ctx->dictionary, ctx->dictSize, and table + * entries in the current context that refer to positions + * preceding the beginning of the current compression are + * ignored. Instead, ctx->dictCtx->dictionary and ctx->dictCtx + * ->dictSize describe the location and size of the preceding + * content, and matches are found by looking in the ctx + * ->dictCtx->hashTable. 
+ */ +typedef enum { noDict = 0, withPrefix64k, usingExtDict, usingDictCtx } dict_directive; +typedef enum { noDictIssue = 0, dictSmall } dictIssue_directive; + + +/*-************************************ +* Local Utils +**************************************/ +int LZ4_versionNumber (void) { return LZ4_VERSION_NUMBER; } +const char* LZ4_versionString(void) { return LZ4_VERSION_STRING; } +int LZ4_compressBound(int isize) { return LZ4_COMPRESSBOUND(isize); } +int LZ4_sizeofState() { return LZ4_STREAMSIZE; } + + +/*-************************************ +* Internal Definitions used in Tests +**************************************/ + +int LZ4_compress_forceExtDict (LZ4_stream_t* LZ4_stream, const char* source, char* dest, int inputSize); + +int LZ4_decompress_safe_forceExtDict(const char* in, char* out, int inSize, int outSize, const void* dict, size_t dictSize); + +/*-****************************** +* Compression functions +********************************/ +static U32 LZ4_hash4(U32 sequence, tableType_t const tableType) +{ + if (tableType == byU16) + return ((sequence * 2654435761U) >> ((MINMATCH*8)-(LZ4_HASHLOG+1))); + else + return ((sequence * 2654435761U) >> ((MINMATCH*8)-LZ4_HASHLOG)); +} + +static U32 LZ4_hash5(U64 sequence, tableType_t const tableType) +{ + const U32 hashLog = (tableType == byU16) ? LZ4_HASHLOG+1 : LZ4_HASHLOG; + if (LZ4_isLittleEndian()) { + const U64 prime5bytes = 889523592379ULL; + return (U32)(((sequence << 24) * prime5bytes) >> (64 - hashLog)); + } else { + const U64 prime8bytes = 11400714785074694791ULL; + return (U32)(((sequence >> 24) * prime8bytes) >> (64 - hashLog)); + } +} + +LZ4_FORCE_INLINE U32 LZ4_hashPosition(const void* const p, tableType_t const tableType) +{ + if ((sizeof(reg_t)==8) && (tableType != byU16)) return LZ4_hash5(LZ4_read_ARCH(p), tableType); + return LZ4_hash4(LZ4_read32(p), tableType); +} + +static void LZ4_putIndexOnHash(U32 idx, U32 h, void* tableBase, tableType_t const tableType) +{ + switch (tableType) + { + default: /* fallthrough */ + case clearedTable: /* fallthrough */ + case byPtr: { /* illegal! */ assert(0); return; } + case byU32: { U32* hashTable = (U32*) tableBase; hashTable[h] = idx; return; } + case byU16: { U16* hashTable = (U16*) tableBase; assert(idx < 65536); hashTable[h] = (U16)idx; return; } + } +} + +static void LZ4_putPositionOnHash(const BYTE* p, U32 h, + void* tableBase, tableType_t const tableType, + const BYTE* srcBase) +{ + switch (tableType) + { + case clearedTable: { /* illegal! */ assert(0); return; } + case byPtr: { const BYTE** hashTable = (const BYTE**)tableBase; hashTable[h] = p; return; } + case byU32: { U32* hashTable = (U32*) tableBase; hashTable[h] = (U32)(p-srcBase); return; } + case byU16: { U16* hashTable = (U16*) tableBase; hashTable[h] = (U16)(p-srcBase); return; } + } +} + +LZ4_FORCE_INLINE void LZ4_putPosition(const BYTE* p, void* tableBase, tableType_t tableType, const BYTE* srcBase) +{ + U32 const h = LZ4_hashPosition(p, tableType); + LZ4_putPositionOnHash(p, h, tableBase, tableType, srcBase); +} + +/* LZ4_getIndexOnHash() : + * Index of match position registered in hash table. + * hash position must be calculated by using base+index, or dictBase+index. + * Assumption 1 : only valid if tableType == byU32 or byU16. 
+ * Assumption 2 : h is presumed valid (within limits of hash table) + */ +static U32 LZ4_getIndexOnHash(U32 h, const void* tableBase, tableType_t tableType) +{ + LZ4_STATIC_ASSERT(LZ4_MEMORY_USAGE > 2); + if (tableType == byU32) { + const U32* const hashTable = (const U32*) tableBase; + assert(h < (1U << (LZ4_MEMORY_USAGE-2))); + return hashTable[h]; + } + if (tableType == byU16) { + const U16* const hashTable = (const U16*) tableBase; + assert(h < (1U << (LZ4_MEMORY_USAGE-1))); + return hashTable[h]; + } + assert(0); return 0; /* forbidden case */ +} + +static const BYTE* LZ4_getPositionOnHash(U32 h, const void* tableBase, tableType_t tableType, const BYTE* srcBase) +{ + if (tableType == byPtr) { const BYTE* const* hashTable = (const BYTE* const*) tableBase; return hashTable[h]; } + if (tableType == byU32) { const U32* const hashTable = (const U32*) tableBase; return hashTable[h] + srcBase; } + { const U16* const hashTable = (const U16*) tableBase; return hashTable[h] + srcBase; } /* default, to ensure a return */ +} + +LZ4_FORCE_INLINE const BYTE* LZ4_getPosition(const BYTE* p, + const void* tableBase, tableType_t tableType, + const BYTE* srcBase) +{ + U32 const h = LZ4_hashPosition(p, tableType); + return LZ4_getPositionOnHash(h, tableBase, tableType, srcBase); +} + +LZ4_FORCE_INLINE void LZ4_prepareTable( + LZ4_stream_t_internal* const cctx, + const int inputSize, + const tableType_t tableType) { + /* If compression failed during the previous step, then the context + * is marked as dirty, therefore, it has to be fully reset. + */ + if (cctx->dirty) { + DEBUGLOG(5, "LZ4_prepareTable: Full reset for %p", cctx); + MEM_INIT(cctx, 0, sizeof(LZ4_stream_t_internal)); + return; + } + + /* If the table hasn't been used, it's guaranteed to be zeroed out, and is + * therefore safe to use no matter what mode we're in. Otherwise, we figure + * out if it's safe to leave as is or whether it needs to be reset. + */ + if (cctx->tableType != clearedTable) { + if (cctx->tableType != tableType + || (tableType == byU16 && cctx->currentOffset + inputSize >= 0xFFFFU) + || (tableType == byU32 && cctx->currentOffset > 1 GB) + || tableType == byPtr + || inputSize >= 4 KB) + { + DEBUGLOG(4, "LZ4_prepareTable: Resetting table in %p", cctx); + MEM_INIT(cctx->hashTable, 0, LZ4_HASHTABLESIZE); + cctx->currentOffset = 0; + cctx->tableType = clearedTable; + } else { + DEBUGLOG(4, "LZ4_prepareTable: Re-use hash table (no reset)"); + } + } + + /* Adding a gap, so all previous entries are > LZ4_DISTANCE_MAX back, is faster + * than compressing without a gap. However, compressing with + * currentOffset == 0 is faster still, so we preserve that case. 
+ */ + if (cctx->currentOffset != 0 && tableType == byU32) { + DEBUGLOG(5, "LZ4_prepareTable: adding 64KB to currentOffset"); + cctx->currentOffset += 64 KB; + } + + /* Finally, clear history */ + cctx->dictCtx = NULL; + cctx->dictionary = NULL; + cctx->dictSize = 0; +} + +/** LZ4_compress_generic() : + inlined, to ensure branches are decided at compilation time */ +LZ4_FORCE_INLINE int LZ4_compress_generic( + LZ4_stream_t_internal* const cctx, + const char* const source, + char* const dest, + const int inputSize, + int *inputConsumed, /* only written when outputDirective == fillOutput */ + const int maxOutputSize, + const limitedOutput_directive outputDirective, + const tableType_t tableType, + const dict_directive dictDirective, + const dictIssue_directive dictIssue, + const int acceleration) +{ + int result; + const BYTE* ip = (const BYTE*) source; + + U32 const startIndex = cctx->currentOffset; + const BYTE* base = (const BYTE*) source - startIndex; + const BYTE* lowLimit; + + const LZ4_stream_t_internal* dictCtx = (const LZ4_stream_t_internal*) cctx->dictCtx; + const BYTE* const dictionary = + dictDirective == usingDictCtx ? dictCtx->dictionary : cctx->dictionary; + const U32 dictSize = + dictDirective == usingDictCtx ? dictCtx->dictSize : cctx->dictSize; + const U32 dictDelta = (dictDirective == usingDictCtx) ? startIndex - dictCtx->currentOffset : 0; /* make indexes in dictCtx comparable with index in current context */ + + int const maybe_extMem = (dictDirective == usingExtDict) || (dictDirective == usingDictCtx); + U32 const prefixIdxLimit = startIndex - dictSize; /* used when dictDirective == dictSmall */ + const BYTE* const dictEnd = dictionary + dictSize; + const BYTE* anchor = (const BYTE*) source; + const BYTE* const iend = ip + inputSize; + const BYTE* const mflimitPlusOne = iend - MFLIMIT + 1; + const BYTE* const matchlimit = iend - LASTLITERALS; + + /* the dictCtx currentOffset is indexed on the start of the dictionary, + * while a dictionary in the current context precedes the currentOffset */ + const BYTE* dictBase = (dictDirective == usingDictCtx) ? + dictionary + dictSize - dictCtx->currentOffset : + dictionary + dictSize - startIndex; + + BYTE* op = (BYTE*) dest; + BYTE* const olimit = op + maxOutputSize; + + U32 offset = 0; + U32 forwardH; + + DEBUGLOG(5, "LZ4_compress_generic: srcSize=%i, tableType=%u", inputSize, tableType); + /* If init conditions are not met, we don't have to mark stream + * as having dirty context, since no action was taken yet */ + if (outputDirective == fillOutput && maxOutputSize < 1) return 0; /* Impossible to store anything */ + if ((U32)inputSize > (U32)LZ4_MAX_INPUT_SIZE) return 0; /* Unsupported inputSize, too large (or negative) */ + if ((tableType == byU16) && (inputSize>=LZ4_64Klimit)) return 0; /* Size too large (not within 64K limit) */ + if (tableType==byPtr) assert(dictDirective==noDict); /* only supported use case with byPtr */ + assert(acceleration >= 1); + + lowLimit = (const BYTE*)source - (dictDirective == withPrefix64k ? dictSize : 0); + + /* Update context state */ + if (dictDirective == usingDictCtx) { + /* Subsequent linked blocks can't use the dictionary. */ + /* Instead, they use the block we just compressed. 
*/ + cctx->dictCtx = NULL; + cctx->dictSize = (U32)inputSize; + } else { + cctx->dictSize += (U32)inputSize; + } + cctx->currentOffset += (U32)inputSize; + cctx->tableType = (U16)tableType; + + if (inputSize<LZ4_minLength) goto _last_literals; /* Input too small, no compression (all literals) */ + + /* First Byte */ + LZ4_putPosition(ip, cctx->hashTable, tableType, base); + ip++; forwardH = LZ4_hashPosition(ip, tableType); + + /* Main Loop */ + for ( ; ; ) { + const BYTE* match; + BYTE* token; + + /* Find a match */ + if (tableType == byPtr) { + const BYTE* forwardIp = ip; + int step = 1; + int searchMatchNb = acceleration << LZ4_skipTrigger; + do { + U32 const h = forwardH; + ip = forwardIp; + forwardIp += step; + step = (searchMatchNb++ >> LZ4_skipTrigger); + + if (unlikely(forwardIp > mflimitPlusOne)) goto _last_literals; + assert(ip < mflimitPlusOne); + + match = LZ4_getPositionOnHash(h, cctx->hashTable, tableType, base); + forwardH = LZ4_hashPosition(forwardIp, tableType); + LZ4_putPositionOnHash(ip, h, cctx->hashTable, tableType, base); + + } while ( (match+LZ4_DISTANCE_MAX < ip) + || (LZ4_read32(match) != LZ4_read32(ip)) ); + + } else { /* byU32, byU16 */ + + const BYTE* forwardIp = ip; + int step = 1; + int searchMatchNb = acceleration << LZ4_skipTrigger; + do { + U32 const h = forwardH; + U32 const current = (U32)(forwardIp - base); + U32 matchIndex = LZ4_getIndexOnHash(h, cctx->hashTable, tableType); + assert(matchIndex <= current); + assert(forwardIp - base < (ptrdiff_t)(2 GB - 1)); + ip = forwardIp; + forwardIp += step; + step = (searchMatchNb++ >> LZ4_skipTrigger); + + if (unlikely(forwardIp > mflimitPlusOne)) goto _last_literals; + assert(ip < mflimitPlusOne); + + if (dictDirective == usingDictCtx) { + if (matchIndex < startIndex) { + /* there was no match, try the dictionary */ + assert(tableType == byU32); + matchIndex = LZ4_getIndexOnHash(h, dictCtx->hashTable, byU32); + match = dictBase + matchIndex; + matchIndex += dictDelta; /* make dictCtx index comparable with current context */ + lowLimit = dictionary; + } else { + match = base + matchIndex; + lowLimit = (const BYTE*)source; + } + } else if (dictDirective==usingExtDict) { + if (matchIndex < startIndex) { + DEBUGLOG(7, "extDict candidate: matchIndex=%5u < startIndex=%5u", matchIndex, startIndex); + assert(startIndex - matchIndex >= MINMATCH); + match = dictBase + matchIndex; + lowLimit = dictionary; + } else { + match = base + matchIndex; + lowLimit = (const BYTE*)source; + } + } else { /* single continuous memory segment */ + match = base + matchIndex; + } + forwardH = LZ4_hashPosition(forwardIp, tableType); + LZ4_putIndexOnHash(current, h, cctx->hashTable, tableType); + + if ((dictIssue == dictSmall) && (matchIndex < prefixIdxLimit)) continue; /* match outside of valid area */ + assert(matchIndex < current); + if ((tableType != byU16) && (matchIndex+LZ4_DISTANCE_MAX < current)) continue; /* too far */ + if (tableType == byU16) assert((current - matchIndex) <= LZ4_DISTANCE_MAX); /* too_far presumed impossible with byU16 */ + + if (LZ4_read32(match) == LZ4_read32(ip)) { + if (maybe_extMem) offset = current - matchIndex; + break; /* match found */ + } + + } while(1); + } + + /* Catch up */ + while (((ip>anchor) & (match > lowLimit)) && (unlikely(ip[-1]==match[-1]))) { ip--; match--; } + + /* Encode Literals */ + { unsigned const litLength = (unsigned)(ip - anchor); + token = op++; + if ((outputDirective == limitedOutput) && /* Check output buffer overflow */ + (unlikely(op + litLength + (2 + 1 + LASTLITERALS) + 
(litLength/255) > olimit)) ) + return 0; /* cannot compress within `dst` budget. Stored indexes in hash table are nonetheless fine */ + + if ((outputDirective == fillOutput) && + (unlikely(op + (litLength+240)/255 /* litlen */ + litLength /* literals */ + 2 /* offset */ + 1 /* token */ + MFLIMIT - MINMATCH /* min last literals so last match is <= end - MFLIMIT */ > olimit))) { + op--; + goto _last_literals; + } + if (litLength >= RUN_MASK) { + int len = (int)(litLength - RUN_MASK); + *token = (RUN_MASK<<ML_BITS); + for(; len >= 255 ; len-=255) *op++ = 255; + *op++ = (BYTE)len; + } + else *token = (BYTE)(litLength<<ML_BITS); + + /* Copy Literals */ + LZ4_wildCopy8(op, anchor, op+litLength); + op+=litLength; + DEBUGLOG(6, "seq.start:%i, literals=%u, match.start:%i", + (int)(anchor-(const BYTE*)source), litLength, (int)(ip-(const BYTE*)source)); + } + +_next_match: + /* at this stage, the following variables must be correctly set : + * - ip : at start of LZ operation + * - match : at start of previous pattern occurence; can be within current prefix, or within extDict + * - offset : if maybe_ext_memSegment==1 (constant) + * - lowLimit : must be == dictionary to mean "match is within extDict"; must be == source otherwise + * - token and *token : position to write 4-bits for match length; higher 4-bits for literal length supposed already written + */ + + if ((outputDirective == fillOutput) && + (op + 2 /* offset */ + 1 /* token */ + MFLIMIT - MINMATCH /* min last literals so last match is <= end - MFLIMIT */ > olimit)) { + /* the match was too close to the end, rewind and go to last literals */ + op = token; + goto _last_literals; + } + + /* Encode Offset */ + if (maybe_extMem) { /* static test */ + DEBUGLOG(6, " with offset=%u (ext if > %i)", offset, (int)(ip - (const BYTE*)source)); + assert(offset <= LZ4_DISTANCE_MAX && offset > 0); + LZ4_writeLE16(op, (U16)offset); op+=2; + } else { + DEBUGLOG(6, " with offset=%u (same segment)", (U32)(ip - match)); + assert(ip-match <= LZ4_DISTANCE_MAX); + LZ4_writeLE16(op, (U16)(ip - match)); op+=2; + } + + /* Encode MatchLength */ + { unsigned matchCode; + + if ( (dictDirective==usingExtDict || dictDirective==usingDictCtx) + && (lowLimit==dictionary) /* match within extDict */ ) { + const BYTE* limit = ip + (dictEnd-match); + assert(dictEnd > match); + if (limit > matchlimit) limit = matchlimit; + matchCode = LZ4_count(ip+MINMATCH, match+MINMATCH, limit); + ip += (size_t)matchCode + MINMATCH; + if (ip==limit) { + unsigned const more = LZ4_count(limit, (const BYTE*)source, matchlimit); + matchCode += more; + ip += more; + } + DEBUGLOG(6, " with matchLength=%u starting in extDict", matchCode+MINMATCH); + } else { + matchCode = LZ4_count(ip+MINMATCH, match+MINMATCH, matchlimit); + ip += (size_t)matchCode + MINMATCH; + DEBUGLOG(6, " with matchLength=%u", matchCode+MINMATCH); + } + + if ((outputDirective) && /* Check output buffer overflow */ + (unlikely(op + (1 + LASTLITERALS) + (matchCode>>8) > olimit)) ) { + if (outputDirective == fillOutput) { + /* Match description too long : reduce it */ + U32 newMatchCode = 15 /* in token */ - 1 /* to avoid needing a zero byte */ + ((U32)(olimit - op) - 2 - 1 - LASTLITERALS) * 255; + ip -= matchCode - newMatchCode; + matchCode = newMatchCode; + } else { + assert(outputDirective == limitedOutput); + return 0; /* cannot compress within `dst` budget. 
Stored indexes in hash table are nonetheless fine */ + } + } + if (matchCode >= ML_MASK) { + *token += ML_MASK; + matchCode -= ML_MASK; + LZ4_write32(op, 0xFFFFFFFF); + while (matchCode >= 4*255) { + op+=4; + LZ4_write32(op, 0xFFFFFFFF); + matchCode -= 4*255; + } + op += matchCode / 255; + *op++ = (BYTE)(matchCode % 255); + } else + *token += (BYTE)(matchCode); + } + + anchor = ip; + + /* Test end of chunk */ + if (ip >= mflimitPlusOne) break; + + /* Fill table */ + LZ4_putPosition(ip-2, cctx->hashTable, tableType, base); + + /* Test next position */ + if (tableType == byPtr) { + + match = LZ4_getPosition(ip, cctx->hashTable, tableType, base); + LZ4_putPosition(ip, cctx->hashTable, tableType, base); + if ( (match+LZ4_DISTANCE_MAX >= ip) + && (LZ4_read32(match) == LZ4_read32(ip)) ) + { token=op++; *token=0; goto _next_match; } + + } else { /* byU32, byU16 */ + + U32 const h = LZ4_hashPosition(ip, tableType); + U32 const current = (U32)(ip-base); + U32 matchIndex = LZ4_getIndexOnHash(h, cctx->hashTable, tableType); + assert(matchIndex < current); + if (dictDirective == usingDictCtx) { + if (matchIndex < startIndex) { + /* there was no match, try the dictionary */ + matchIndex = LZ4_getIndexOnHash(h, dictCtx->hashTable, byU32); + match = dictBase + matchIndex; + lowLimit = dictionary; /* required for match length counter */ + matchIndex += dictDelta; + } else { + match = base + matchIndex; + lowLimit = (const BYTE*)source; /* required for match length counter */ + } + } else if (dictDirective==usingExtDict) { + if (matchIndex < startIndex) { + match = dictBase + matchIndex; + lowLimit = dictionary; /* required for match length counter */ + } else { + match = base + matchIndex; + lowLimit = (const BYTE*)source; /* required for match length counter */ + } + } else { /* single memory segment */ + match = base + matchIndex; + } + LZ4_putIndexOnHash(current, h, cctx->hashTable, tableType); + assert(matchIndex < current); + if ( ((dictIssue==dictSmall) ? (matchIndex >= prefixIdxLimit) : 1) + && ((tableType==byU16) ? 1 : (matchIndex+LZ4_DISTANCE_MAX >= current)) + && (LZ4_read32(match) == LZ4_read32(ip)) ) { + token=op++; + *token=0; + if (maybe_extMem) offset = current - matchIndex; + DEBUGLOG(6, "seq.start:%i, literals=%u, match.start:%i", + (int)(anchor-(const BYTE*)source), 0, (int)(ip-(const BYTE*)source)); + goto _next_match; + } + } + + /* Prepare next loop */ + forwardH = LZ4_hashPosition(++ip, tableType); + + } + +_last_literals: + /* Encode Last Literals */ + { size_t lastRun = (size_t)(iend - anchor); + if ( (outputDirective) && /* Check output buffer overflow */ + (op + lastRun + 1 + ((lastRun+255-RUN_MASK)/255) > olimit)) { + if (outputDirective == fillOutput) { + /* adapt lastRun to fill 'dst' */ + assert(olimit >= op); + lastRun = (size_t)(olimit-op) - 1; + lastRun -= (lastRun+240)/255; + } else { + assert(outputDirective == limitedOutput); + return 0; /* cannot compress within `dst` budget. 
Stored indexes in hash table are nonetheless fine */ + } + } + if (lastRun >= RUN_MASK) { + size_t accumulator = lastRun - RUN_MASK; + *op++ = RUN_MASK << ML_BITS; + for(; accumulator >= 255 ; accumulator-=255) *op++ = 255; + *op++ = (BYTE) accumulator; + } else { + *op++ = (BYTE)(lastRun<<ML_BITS); + } + memcpy(op, anchor, lastRun); + ip = anchor + lastRun; + op += lastRun; + } + + if (outputDirective == fillOutput) { + *inputConsumed = (int) (((const char*)ip)-source); + } + DEBUGLOG(5, "LZ4_compress_generic: compressed %i bytes into %i bytes", inputSize, (int)(((char*)op) - dest)); + result = (int)(((char*)op) - dest); + assert(result > 0); + return result; +} + + +int LZ4_compress_fast_extState(void* state, const char* source, char* dest, int inputSize, int maxOutputSize, int acceleration) +{ + LZ4_stream_t_internal* const ctx = & LZ4_initStream(state, sizeof(LZ4_stream_t)) -> internal_donotuse; + assert(ctx != NULL); + if (acceleration < 1) acceleration = ACCELERATION_DEFAULT; + if (maxOutputSize >= LZ4_compressBound(inputSize)) { + if (inputSize < LZ4_64Klimit) { + return LZ4_compress_generic(ctx, source, dest, inputSize, NULL, 0, notLimited, byU16, noDict, noDictIssue, acceleration); + } else { + const tableType_t tableType = ((sizeof(void*)==4) && ((uptrval)source > LZ4_DISTANCE_MAX)) ? byPtr : byU32; + return LZ4_compress_generic(ctx, source, dest, inputSize, NULL, 0, notLimited, tableType, noDict, noDictIssue, acceleration); + } + } else { + if (inputSize < LZ4_64Klimit) {; + return LZ4_compress_generic(ctx, source, dest, inputSize, NULL, maxOutputSize, limitedOutput, byU16, noDict, noDictIssue, acceleration); + } else { + const tableType_t tableType = ((sizeof(void*)==4) && ((uptrval)source > LZ4_DISTANCE_MAX)) ? byPtr : byU32; + return LZ4_compress_generic(ctx, source, dest, inputSize, NULL, maxOutputSize, limitedOutput, tableType, noDict, noDictIssue, acceleration); + } + } +} + +/** + * LZ4_compress_fast_extState_fastReset() : + * A variant of LZ4_compress_fast_extState(). + * + * Using this variant avoids an expensive initialization step. It is only safe + * to call if the state buffer is known to be correctly initialized already + * (see comment in lz4.h on LZ4_resetStream_fast() for a definition of + * "correctly initialized"). + */ +int LZ4_compress_fast_extState_fastReset(void* state, const char* src, char* dst, int srcSize, int dstCapacity, int acceleration) +{ + LZ4_stream_t_internal* ctx = &((LZ4_stream_t*)state)->internal_donotuse; + if (acceleration < 1) acceleration = ACCELERATION_DEFAULT; + + if (dstCapacity >= LZ4_compressBound(srcSize)) { + if (srcSize < LZ4_64Klimit) { + const tableType_t tableType = byU16; + LZ4_prepareTable(ctx, srcSize, tableType); + if (ctx->currentOffset) { + return LZ4_compress_generic(ctx, src, dst, srcSize, NULL, 0, notLimited, tableType, noDict, dictSmall, acceleration); + } else { + return LZ4_compress_generic(ctx, src, dst, srcSize, NULL, 0, notLimited, tableType, noDict, noDictIssue, acceleration); + } + } else { + const tableType_t tableType = ((sizeof(void*)==4) && ((uptrval)src > LZ4_DISTANCE_MAX)) ? 
byPtr : byU32; + LZ4_prepareTable(ctx, srcSize, tableType); + return LZ4_compress_generic(ctx, src, dst, srcSize, NULL, 0, notLimited, tableType, noDict, noDictIssue, acceleration); + } + } else { + if (srcSize < LZ4_64Klimit) { + const tableType_t tableType = byU16; + LZ4_prepareTable(ctx, srcSize, tableType); + if (ctx->currentOffset) { + return LZ4_compress_generic(ctx, src, dst, srcSize, NULL, dstCapacity, limitedOutput, tableType, noDict, dictSmall, acceleration); + } else { + return LZ4_compress_generic(ctx, src, dst, srcSize, NULL, dstCapacity, limitedOutput, tableType, noDict, noDictIssue, acceleration); + } + } else { + const tableType_t tableType = ((sizeof(void*)==4) && ((uptrval)src > LZ4_DISTANCE_MAX)) ? byPtr : byU32; + LZ4_prepareTable(ctx, srcSize, tableType); + return LZ4_compress_generic(ctx, src, dst, srcSize, NULL, dstCapacity, limitedOutput, tableType, noDict, noDictIssue, acceleration); + } + } +} + + +int LZ4_compress_fast(const char* source, char* dest, int inputSize, int maxOutputSize, int acceleration) +{ + int result; +#if (LZ4_HEAPMODE) + LZ4_stream_t* ctxPtr = ALLOC(sizeof(LZ4_stream_t)); /* malloc-calloc always properly aligned */ + if (ctxPtr == NULL) return 0; +#else + LZ4_stream_t ctx; + LZ4_stream_t* const ctxPtr = &ctx; +#endif + result = LZ4_compress_fast_extState(ctxPtr, source, dest, inputSize, maxOutputSize, acceleration); + +#if (LZ4_HEAPMODE) + FREEMEM(ctxPtr); +#endif + return result; +} + + +int LZ4_compress_default(const char* src, char* dst, int srcSize, int maxOutputSize) +{ + return LZ4_compress_fast(src, dst, srcSize, maxOutputSize, 1); +} + + +/* hidden debug function */ +/* strangely enough, gcc generates faster code when this function is uncommented, even if unused */ +int LZ4_compress_fast_force(const char* src, char* dst, int srcSize, int dstCapacity, int acceleration) +{ + LZ4_stream_t ctx; + LZ4_initStream(&ctx, sizeof(ctx)); + + if (srcSize < LZ4_64Klimit) { + return LZ4_compress_generic(&ctx.internal_donotuse, src, dst, srcSize, NULL, dstCapacity, limitedOutput, byU16, noDict, noDictIssue, acceleration); + } else { + tableType_t const addrMode = (sizeof(void*) > 4) ? byU32 : byPtr; + return LZ4_compress_generic(&ctx.internal_donotuse, src, dst, srcSize, NULL, dstCapacity, limitedOutput, addrMode, noDict, noDictIssue, acceleration); + } +} + + +/* Note!: This function leaves the stream in an unclean/broken state! + * It is not safe to subsequently use the same state with a _fastReset() or + * _continue() call without resetting it. */ +static int LZ4_compress_destSize_extState (LZ4_stream_t* state, const char* src, char* dst, int* srcSizePtr, int targetDstSize) +{ + void* const s = LZ4_initStream(state, sizeof (*state)); + assert(s != NULL); (void)s; + + if (targetDstSize >= LZ4_compressBound(*srcSizePtr)) { /* compression success is guaranteed */ + return LZ4_compress_fast_extState(state, src, dst, *srcSizePtr, targetDstSize, 1); + } else { + if (*srcSizePtr < LZ4_64Klimit) { + return LZ4_compress_generic(&state->internal_donotuse, src, dst, *srcSizePtr, srcSizePtr, targetDstSize, fillOutput, byU16, noDict, noDictIssue, 1); + } else { + tableType_t const addrMode = ((sizeof(void*)==4) && ((uptrval)src > LZ4_DISTANCE_MAX)) ? 
byPtr : byU32; + return LZ4_compress_generic(&state->internal_donotuse, src, dst, *srcSizePtr, srcSizePtr, targetDstSize, fillOutput, addrMode, noDict, noDictIssue, 1); + } } +} + + +int LZ4_compress_destSize(const char* src, char* dst, int* srcSizePtr, int targetDstSize) +{ +#if (LZ4_HEAPMODE) + LZ4_stream_t* ctx = (LZ4_stream_t*)ALLOC(sizeof(LZ4_stream_t)); /* malloc-calloc always properly aligned */ + if (ctx == NULL) return 0; +#else + LZ4_stream_t ctxBody; + LZ4_stream_t* ctx = &ctxBody; +#endif + + int result = LZ4_compress_destSize_extState(ctx, src, dst, srcSizePtr, targetDstSize); + +#if (LZ4_HEAPMODE) + FREEMEM(ctx); +#endif + return result; +} + + + +/*-****************************** +* Streaming functions +********************************/ + +LZ4_stream_t* LZ4_createStream(void) +{ + LZ4_stream_t* const lz4s = (LZ4_stream_t*)ALLOC(sizeof(LZ4_stream_t)); + LZ4_STATIC_ASSERT(LZ4_STREAMSIZE >= sizeof(LZ4_stream_t_internal)); /* A compilation error here means LZ4_STREAMSIZE is not large enough */ + DEBUGLOG(4, "LZ4_createStream %p", lz4s); + if (lz4s == NULL) return NULL; + LZ4_initStream(lz4s, sizeof(*lz4s)); + return lz4s; +} + +#ifndef _MSC_VER /* for some reason, Visual fails the aligment test on 32-bit x86 : + it reports an aligment of 8-bytes, + while actually aligning LZ4_stream_t on 4 bytes. */ +static size_t LZ4_stream_t_alignment(void) +{ + struct { char c; LZ4_stream_t t; } t_a; + return sizeof(t_a) - sizeof(t_a.t); +} +#endif + +LZ4_stream_t* LZ4_initStream (void* buffer, size_t size) +{ + DEBUGLOG(5, "LZ4_initStream"); + if (buffer == NULL) return NULL; + if (size < sizeof(LZ4_stream_t)) return NULL; +#ifndef _MSC_VER /* for some reason, Visual fails the aligment test on 32-bit x86 : + it reports an aligment of 8-bytes, + while actually aligning LZ4_stream_t on 4 bytes. 
*/ + if (((size_t)buffer) & (LZ4_stream_t_alignment() - 1)) return NULL; /* alignment check */ +#endif + MEM_INIT(buffer, 0, sizeof(LZ4_stream_t)); + return (LZ4_stream_t*)buffer; +} + +/* resetStream is now deprecated, + * prefer initStream() which is more general */ +void LZ4_resetStream (LZ4_stream_t* LZ4_stream) +{ + DEBUGLOG(5, "LZ4_resetStream (ctx:%p)", LZ4_stream); + MEM_INIT(LZ4_stream, 0, sizeof(LZ4_stream_t)); +} + +void LZ4_resetStream_fast(LZ4_stream_t* ctx) { + LZ4_prepareTable(&(ctx->internal_donotuse), 0, byU32); +} + +int LZ4_freeStream (LZ4_stream_t* LZ4_stream) +{ + if (!LZ4_stream) return 0; /* support free on NULL */ + DEBUGLOG(5, "LZ4_freeStream %p", LZ4_stream); + FREEMEM(LZ4_stream); + return (0); +} + + +#define HASH_UNIT sizeof(reg_t) +int LZ4_loadDict (LZ4_stream_t* LZ4_dict, const char* dictionary, int dictSize) +{ + LZ4_stream_t_internal* dict = &LZ4_dict->internal_donotuse; + const tableType_t tableType = byU32; + const BYTE* p = (const BYTE*)dictionary; + const BYTE* const dictEnd = p + dictSize; + const BYTE* base; + + DEBUGLOG(4, "LZ4_loadDict (%i bytes from %p into %p)", dictSize, dictionary, LZ4_dict); + + /* It's necessary to reset the context, + * and not just continue it with prepareTable() + * to avoid any risk of generating overflowing matchIndex + * when compressing using this dictionary */ + LZ4_resetStream(LZ4_dict); + + /* We always increment the offset by 64 KB, since, if the dict is longer, + * we truncate it to the last 64k, and if it's shorter, we still want to + * advance by a whole window length so we can provide the guarantee that + * there are only valid offsets in the window, which allows an optimization + * in LZ4_compress_fast_continue() where it uses noDictIssue even when the + * dictionary isn't a full 64k. */ + + if ((dictEnd - p) > 64 KB) p = dictEnd - 64 KB; + base = dictEnd - 64 KB - dict->currentOffset; + dict->dictionary = p; + dict->dictSize = (U32)(dictEnd - p); + dict->currentOffset += 64 KB; + dict->tableType = tableType; + + if (dictSize < (int)HASH_UNIT) { + return 0; + } + + while (p <= dictEnd-HASH_UNIT) { + LZ4_putPosition(p, dict->hashTable, tableType, base); + p+=3; + } + + return (int)dict->dictSize; +} + +void LZ4_attach_dictionary(LZ4_stream_t *working_stream, const LZ4_stream_t *dictionary_stream) { + /* Calling LZ4_resetStream_fast() here makes sure that changes will not be + * erased by subsequent calls to LZ4_resetStream_fast() in case stream was + * marked as having dirty context, e.g. requiring full reset. + */ + LZ4_resetStream_fast(working_stream); + + if (dictionary_stream != NULL) { + /* If the current offset is zero, we will never look in the + * external dictionary context, since there is no value a table + * entry can take that indicate a miss. In that case, we need + * to bump the offset to something non-zero. 
+ */ + if (working_stream->internal_donotuse.currentOffset == 0) { + working_stream->internal_donotuse.currentOffset = 64 KB; + } + working_stream->internal_donotuse.dictCtx = &(dictionary_stream->internal_donotuse); + } else { + working_stream->internal_donotuse.dictCtx = NULL; + } +} + + +static void LZ4_renormDictT(LZ4_stream_t_internal* LZ4_dict, int nextSize) +{ + assert(nextSize >= 0); + if (LZ4_dict->currentOffset + (unsigned)nextSize > 0x80000000) { /* potential ptrdiff_t overflow (32-bits mode) */ + /* rescale hash table */ + U32 const delta = LZ4_dict->currentOffset - 64 KB; + const BYTE* dictEnd = LZ4_dict->dictionary + LZ4_dict->dictSize; + int i; + DEBUGLOG(4, "LZ4_renormDictT"); + for (i=0; i<LZ4_HASH_SIZE_U32; i++) { + if (LZ4_dict->hashTable[i] < delta) LZ4_dict->hashTable[i]=0; + else LZ4_dict->hashTable[i] -= delta; + } + LZ4_dict->currentOffset = 64 KB; + if (LZ4_dict->dictSize > 64 KB) LZ4_dict->dictSize = 64 KB; + LZ4_dict->dictionary = dictEnd - LZ4_dict->dictSize; + } +} + + +int LZ4_compress_fast_continue (LZ4_stream_t* LZ4_stream, + const char* source, char* dest, + int inputSize, int maxOutputSize, + int acceleration) +{ + const tableType_t tableType = byU32; + LZ4_stream_t_internal* streamPtr = &LZ4_stream->internal_donotuse; + const BYTE* dictEnd = streamPtr->dictionary + streamPtr->dictSize; + + DEBUGLOG(5, "LZ4_compress_fast_continue (inputSize=%i)", inputSize); + + if (streamPtr->dirty) return 0; /* Uninitialized structure detected */ + LZ4_renormDictT(streamPtr, inputSize); /* avoid index overflow */ + if (acceleration < 1) acceleration = ACCELERATION_DEFAULT; + + /* invalidate tiny dictionaries */ + if ( (streamPtr->dictSize-1 < 4-1) /* intentional underflow */ + && (dictEnd != (const BYTE*)source) ) { + DEBUGLOG(5, "LZ4_compress_fast_continue: dictSize(%u) at addr:%p is too small", streamPtr->dictSize, streamPtr->dictionary); + streamPtr->dictSize = 0; + streamPtr->dictionary = (const BYTE*)source; + dictEnd = (const BYTE*)source; + } + + /* Check overlapping input/dictionary space */ + { const BYTE* sourceEnd = (const BYTE*) source + inputSize; + if ((sourceEnd > streamPtr->dictionary) && (sourceEnd < dictEnd)) { + streamPtr->dictSize = (U32)(dictEnd - sourceEnd); + if (streamPtr->dictSize > 64 KB) streamPtr->dictSize = 64 KB; + if (streamPtr->dictSize < 4) streamPtr->dictSize = 0; + streamPtr->dictionary = dictEnd - streamPtr->dictSize; + } + } + + /* prefix mode : source data follows dictionary */ + if (dictEnd == (const BYTE*)source) { + if ((streamPtr->dictSize < 64 KB) && (streamPtr->dictSize < streamPtr->currentOffset)) + return LZ4_compress_generic(streamPtr, source, dest, inputSize, NULL, maxOutputSize, limitedOutput, tableType, withPrefix64k, dictSmall, acceleration); + else + return LZ4_compress_generic(streamPtr, source, dest, inputSize, NULL, maxOutputSize, limitedOutput, tableType, withPrefix64k, noDictIssue, acceleration); + } + + /* external dictionary mode */ + { int result; + if (streamPtr->dictCtx) { + /* We depend here on the fact that dictCtx'es (produced by + * LZ4_loadDict) guarantee that their tables contain no references + * to offsets between dictCtx->currentOffset - 64 KB and + * dictCtx->currentOffset - dictCtx->dictSize. This makes it safe + * to use noDictIssue even when the dict isn't a full 64 KB. + */ + if (inputSize > 4 KB) { + /* For compressing large blobs, it is faster to pay the setup + * cost to copy the dictionary's tables into the active context, + * so that the compression loop is only looking into one table. 
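+             * Note : the memcpy below copies the dictionary stream's whole
+             * internal state (hash table, currentOffset, dictionary pointer)
+             * into the working context ; compression then proceeds in plain
+             * usingExtDict mode against that copied state.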
+ */ + memcpy(streamPtr, streamPtr->dictCtx, sizeof(LZ4_stream_t)); + result = LZ4_compress_generic(streamPtr, source, dest, inputSize, NULL, maxOutputSize, limitedOutput, tableType, usingExtDict, noDictIssue, acceleration); + } else { + result = LZ4_compress_generic(streamPtr, source, dest, inputSize, NULL, maxOutputSize, limitedOutput, tableType, usingDictCtx, noDictIssue, acceleration); + } + } else { + if ((streamPtr->dictSize < 64 KB) && (streamPtr->dictSize < streamPtr->currentOffset)) { + result = LZ4_compress_generic(streamPtr, source, dest, inputSize, NULL, maxOutputSize, limitedOutput, tableType, usingExtDict, dictSmall, acceleration); + } else { + result = LZ4_compress_generic(streamPtr, source, dest, inputSize, NULL, maxOutputSize, limitedOutput, tableType, usingExtDict, noDictIssue, acceleration); + } + } + streamPtr->dictionary = (const BYTE*)source; + streamPtr->dictSize = (U32)inputSize; + return result; + } +} + + +/* Hidden debug function, to force-test external dictionary mode */ +int LZ4_compress_forceExtDict (LZ4_stream_t* LZ4_dict, const char* source, char* dest, int srcSize) +{ + LZ4_stream_t_internal* streamPtr = &LZ4_dict->internal_donotuse; + int result; + + LZ4_renormDictT(streamPtr, srcSize); + + if ((streamPtr->dictSize < 64 KB) && (streamPtr->dictSize < streamPtr->currentOffset)) { + result = LZ4_compress_generic(streamPtr, source, dest, srcSize, NULL, 0, notLimited, byU32, usingExtDict, dictSmall, 1); + } else { + result = LZ4_compress_generic(streamPtr, source, dest, srcSize, NULL, 0, notLimited, byU32, usingExtDict, noDictIssue, 1); + } + + streamPtr->dictionary = (const BYTE*)source; + streamPtr->dictSize = (U32)srcSize; + + return result; +} + + +/*! LZ4_saveDict() : + * If previously compressed data block is not guaranteed to remain available at its memory location, + * save it into a safer place (char* safeBuffer). + * Note : you don't need to call LZ4_loadDict() afterwards, + * dictionary is immediately usable, you can therefore call LZ4_compress_fast_continue(). + * Return : saved dictionary size in bytes (necessarily <= dictSize), or 0 if error. + */ +int LZ4_saveDict (LZ4_stream_t* LZ4_dict, char* safeBuffer, int dictSize) +{ + LZ4_stream_t_internal* const dict = &LZ4_dict->internal_donotuse; + const BYTE* const previousDictEnd = dict->dictionary + dict->dictSize; + + if ((U32)dictSize > 64 KB) dictSize = 64 KB; /* useless to define a dictionary > 64 KB */ + if ((U32)dictSize > dict->dictSize) dictSize = (int)dict->dictSize; + + memmove(safeBuffer, previousDictEnd - dictSize, dictSize); + + dict->dictionary = (const BYTE*)safeBuffer; + dict->dictSize = (U32)dictSize; + + return dictSize; +} + + + +/*-******************************* + * Decompression functions + ********************************/ + +typedef enum { endOnOutputSize = 0, endOnInputSize = 1 } endCondition_directive; +typedef enum { decode_full_block = 0, partial_decode = 1 } earlyEnd_directive; + +#undef MIN +#define MIN(a,b) ( (a) < (b) ? (a) : (b) ) + +/* Read the variable-length literal or match length. + * + * ip - pointer to use as input. + * lencheck - end ip. Return an error if ip advances >= lencheck. + * loop_check - check ip >= lencheck in body of loop. Returns loop_error if so. + * initial_check - check ip >= lencheck before start of loop. Returns initial_error if so. + * error (output) - error code. Should be set to 0 before call. 
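+ *
+ * Illustrative example : the byte sequence 255, 255, 3 decodes to
+ * 255 + 255 + 3 = 513 of extra length ; callers add this on top of the 15
+ * (RUN_MASK / ML_MASK) already carried by the token. A byte below 255 ends
+ * the sequence immediately.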
+ */ +typedef enum { loop_error = -2, initial_error = -1, ok = 0 } variable_length_error; +LZ4_FORCE_INLINE unsigned +read_variable_length(const BYTE**ip, const BYTE* lencheck, int loop_check, int initial_check, variable_length_error* error) +{ + unsigned length = 0; + unsigned s; + if (initial_check && unlikely((*ip) >= lencheck)) { /* overflow detection */ + *error = initial_error; + return length; + } + do { + s = **ip; + (*ip)++; + length += s; + if (loop_check && unlikely((*ip) >= lencheck)) { /* overflow detection */ + *error = loop_error; + return length; + } + } while (s==255); + + return length; +} + +/*! LZ4_decompress_generic() : + * This generic decompression function covers all use cases. + * It shall be instantiated several times, using different sets of directives. + * Note that it is important for performance that this function really get inlined, + * in order to remove useless branches during compilation optimization. + */ +LZ4_FORCE_INLINE int +LZ4_decompress_generic( + const char* const src, + char* const dst, + int srcSize, + int outputSize, /* If endOnInput==endOnInputSize, this value is `dstCapacity` */ + + endCondition_directive endOnInput, /* endOnOutputSize, endOnInputSize */ + earlyEnd_directive partialDecoding, /* full, partial */ + dict_directive dict, /* noDict, withPrefix64k, usingExtDict */ + const BYTE* const lowPrefix, /* always <= dst, == dst when no prefix */ + const BYTE* const dictStart, /* only if dict==usingExtDict */ + const size_t dictSize /* note : = 0 if noDict */ + ) +{ + if (src == NULL) return -1; + + { const BYTE* ip = (const BYTE*) src; + const BYTE* const iend = ip + srcSize; + + BYTE* op = (BYTE*) dst; + BYTE* const oend = op + outputSize; + BYTE* cpy; + + const BYTE* const dictEnd = (dictStart == NULL) ? NULL : dictStart + dictSize; + + const int safeDecode = (endOnInput==endOnInputSize); + const int checkOffset = ((safeDecode) && (dictSize < (int)(64 KB))); + + + /* Set up the "end" pointers for the shortcut. */ + const BYTE* const shortiend = iend - (endOnInput ? 14 : 8) /*maxLL*/ - 2 /*offset*/; + const BYTE* const shortoend = oend - (endOnInput ? 14 : 8) /*maxLL*/ - 18 /*maxML*/; + + const BYTE* match; + size_t offset; + unsigned token; + size_t length; + + + DEBUGLOG(5, "LZ4_decompress_generic (srcSize:%i, dstSize:%i)", srcSize, outputSize); + + /* Special cases */ + assert(lowPrefix <= op); + if ((endOnInput) && (unlikely(outputSize==0))) return ((srcSize==1) && (*ip==0)) ? 0 : -1; /* Empty output buffer */ + if ((!endOnInput) && (unlikely(outputSize==0))) return (*ip==0 ? 1 : -1); + if ((endOnInput) && unlikely(srcSize==0)) return -1; + + /* Currently the fast loop shows a regression on qualcomm arm chips. 
*/ +#if LZ4_FAST_DEC_LOOP + if ((oend - op) < FASTLOOP_SAFE_DISTANCE) { + DEBUGLOG(6, "skip fast decode loop"); + goto safe_decode; + } + + /* Fast loop : decode sequences as long as output < iend-FASTLOOP_SAFE_DISTANCE */ + while (1) { + /* Main fastloop assertion: We can always wildcopy FASTLOOP_SAFE_DISTANCE */ + assert(oend - op >= FASTLOOP_SAFE_DISTANCE); + if (endOnInput) assert(ip < iend); + token = *ip++; + length = token >> ML_BITS; /* literal length */ + + assert(!endOnInput || ip <= iend); /* ip < iend before the increment */ + + /* decode literal length */ + if (length == RUN_MASK) { + variable_length_error error = ok; + length += read_variable_length(&ip, iend-RUN_MASK, endOnInput, endOnInput, &error); + if (error == initial_error) goto _output_error; + if ((safeDecode) && unlikely((uptrval)(op)+length<(uptrval)(op))) goto _output_error; /* overflow detection */ + if ((safeDecode) && unlikely((uptrval)(ip)+length<(uptrval)(ip))) goto _output_error; /* overflow detection */ + + /* copy literals */ + cpy = op+length; + LZ4_STATIC_ASSERT(MFLIMIT >= WILDCOPYLENGTH); + if (endOnInput) { /* LZ4_decompress_safe() */ + if ((cpy>oend-32) || (ip+length>iend-32)) goto safe_literal_copy; + LZ4_wildCopy32(op, ip, cpy); + } else { /* LZ4_decompress_fast() */ + if (cpy>oend-8) goto safe_literal_copy; + LZ4_wildCopy8(op, ip, cpy); /* LZ4_decompress_fast() cannot copy more than 8 bytes at a time : + * it doesn't know input length, and only relies on end-of-block properties */ + } + ip += length; op = cpy; + } else { + cpy = op+length; + if (endOnInput) { /* LZ4_decompress_safe() */ + DEBUGLOG(7, "copy %u bytes in a 16-bytes stripe", (unsigned)length); + /* We don't need to check oend, since we check it once for each loop below */ + if (ip > iend-(16 + 1/*max lit + offset + nextToken*/)) goto safe_literal_copy; + /* Literals can only be 14, but hope compilers optimize if we copy by a register size */ + memcpy(op, ip, 16); + } else { /* LZ4_decompress_fast() */ + /* LZ4_decompress_fast() cannot copy more than 8 bytes at a time : + * it doesn't know input length, and relies on end-of-block properties */ + memcpy(op, ip, 8); + if (length > 8) memcpy(op+8, ip+8, 8); + } + ip += length; op = cpy; + } + + /* get offset */ + offset = LZ4_readLE16(ip); ip+=2; + match = op - offset; + + /* get matchlength */ + length = token & ML_MASK; + + if ((checkOffset) && (unlikely(match + dictSize < lowPrefix))) goto _output_error; /* Error : offset outside buffers */ + + if (length == ML_MASK) { + variable_length_error error = ok; + length += read_variable_length(&ip, iend - LASTLITERALS + 1, endOnInput, 0, &error); + if (error != ok) goto _output_error; + if ((safeDecode) && unlikely((uptrval)(op)+length<(uptrval)op)) goto _output_error; /* overflow detection */ + length += MINMATCH; + if (op + length >= oend - FASTLOOP_SAFE_DISTANCE) { + goto safe_match_copy; + } + } else { + length += MINMATCH; + if (op + length >= oend - FASTLOOP_SAFE_DISTANCE) { + goto safe_match_copy; + } + + /* Fastpath check: Avoids a branch in LZ4_wildCopy32 if true */ + if (!(dict == usingExtDict) || (match >= lowPrefix)) { + if (offset >= 8) { + memcpy(op, match, 8); + memcpy(op+8, match+8, 8); + memcpy(op+16, match+16, 2); + op += length; + continue; + } } } + + /* match starting within external dictionary */ + if ((dict==usingExtDict) && (match < lowPrefix)) { + if (unlikely(op+length > oend-LASTLITERALS)) { + if (partialDecoding) length = MIN(length, (size_t)(oend-op)); + else goto _output_error; /* doesn't respect parsing 
restriction */ + } + + if (length <= (size_t)(lowPrefix-match)) { + /* match fits entirely within external dictionary : just copy */ + memmove(op, dictEnd - (lowPrefix-match), length); + op += length; + } else { + /* match stretches into both external dictionary and current block */ + size_t const copySize = (size_t)(lowPrefix - match); + size_t const restSize = length - copySize; + memcpy(op, dictEnd - copySize, copySize); + op += copySize; + if (restSize > (size_t)(op - lowPrefix)) { /* overlap copy */ + BYTE* const endOfMatch = op + restSize; + const BYTE* copyFrom = lowPrefix; + while (op < endOfMatch) *op++ = *copyFrom++; + } else { + memcpy(op, lowPrefix, restSize); + op += restSize; + } } + continue; + } + + /* copy match within block */ + cpy = op + length; + + assert((op <= oend) && (oend-op >= 32)); + if (unlikely(offset<16)) { + LZ4_memcpy_using_offset(op, match, cpy, offset); + } else { + LZ4_wildCopy32(op, match, cpy); + } + + op = cpy; /* wildcopy correction */ + } + safe_decode: +#endif + + /* Main Loop : decode remaining sequences where output < FASTLOOP_SAFE_DISTANCE */ + while (1) { + token = *ip++; + length = token >> ML_BITS; /* literal length */ + + assert(!endOnInput || ip <= iend); /* ip < iend before the increment */ + + /* A two-stage shortcut for the most common case: + * 1) If the literal length is 0..14, and there is enough space, + * enter the shortcut and copy 16 bytes on behalf of the literals + * (in the fast mode, only 8 bytes can be safely copied this way). + * 2) Further if the match length is 4..18, copy 18 bytes in a similar + * manner; but we ensure that there's enough space in the output for + * those 18 bytes earlier, upon entering the shortcut (in other words, + * there is a combined check for both stages). + */ + if ( (endOnInput ? length != RUN_MASK : length <= 8) + /* strictly "less than" on input, to re-enter the loop with at least one byte */ + && likely((endOnInput ? ip < shortiend : 1) & (op <= shortoend)) ) { + /* Copy the literals */ + memcpy(op, ip, endOnInput ? 16 : 8); + op += length; ip += length; + + /* The second stage: prepare for match copying, decode full info. + * If it doesn't work out, the info won't be wasted. */ + length = token & ML_MASK; /* match length */ + offset = LZ4_readLE16(ip); ip += 2; + match = op - offset; + assert(match <= op); /* check overflow */ + + /* Do not deal with overlapping matches. */ + if ( (length != ML_MASK) + && (offset >= 8) + && (dict==withPrefix64k || match >= lowPrefix) ) { + /* Copy the match. */ + memcpy(op + 0, match + 0, 8); + memcpy(op + 8, match + 8, 8); + memcpy(op +16, match +16, 2); + op += length + MINMATCH; + /* Both stages worked, load the next token. */ + continue; + } + + /* The second stage didn't work out, but the info is ready. + * Propel it right to the point of match copying. 
*/ + goto _copy_match; + } + + /* decode literal length */ + if (length == RUN_MASK) { + variable_length_error error = ok; + length += read_variable_length(&ip, iend-RUN_MASK, endOnInput, endOnInput, &error); + if (error == initial_error) goto _output_error; + if ((safeDecode) && unlikely((uptrval)(op)+length<(uptrval)(op))) goto _output_error; /* overflow detection */ + if ((safeDecode) && unlikely((uptrval)(ip)+length<(uptrval)(ip))) goto _output_error; /* overflow detection */ + } + + /* copy literals */ + cpy = op+length; +#if LZ4_FAST_DEC_LOOP + safe_literal_copy: +#endif + LZ4_STATIC_ASSERT(MFLIMIT >= WILDCOPYLENGTH); + if ( ((endOnInput) && ((cpy>oend-MFLIMIT) || (ip+length>iend-(2+1+LASTLITERALS))) ) + || ((!endOnInput) && (cpy>oend-WILDCOPYLENGTH)) ) + { + if (partialDecoding) { + if (cpy > oend) { cpy = oend; assert(op<=oend); length = (size_t)(oend-op); } /* Partial decoding : stop in the middle of literal segment */ + if ((endOnInput) && (ip+length > iend)) goto _output_error; /* Error : read attempt beyond end of input buffer */ + } else { + if ((!endOnInput) && (cpy != oend)) goto _output_error; /* Error : block decoding must stop exactly there */ + if ((endOnInput) && ((ip+length != iend) || (cpy > oend))) goto _output_error; /* Error : input must be consumed */ + } + memcpy(op, ip, length); + ip += length; + op += length; + if (!partialDecoding || (cpy == oend)) { + /* Necessarily EOF, due to parsing restrictions */ + break; + } + + } else { + LZ4_wildCopy8(op, ip, cpy); /* may overwrite up to WILDCOPYLENGTH beyond cpy */ + ip += length; op = cpy; + } + + /* get offset */ + offset = LZ4_readLE16(ip); ip+=2; + match = op - offset; + + /* get matchlength */ + length = token & ML_MASK; + + _copy_match: + if ((checkOffset) && (unlikely(match + dictSize < lowPrefix))) goto _output_error; /* Error : offset outside buffers */ + if (!partialDecoding) { + assert(oend > op); + assert(oend - op >= 4); + LZ4_write32(op, 0); /* silence an msan warning when offset==0; costs <1%; */ + } /* note : when partialDecoding, there is no guarantee that at least 4 bytes remain available in output buffer */ + + if (length == ML_MASK) { + variable_length_error error = ok; + length += read_variable_length(&ip, iend - LASTLITERALS + 1, endOnInput, 0, &error); + if (error != ok) goto _output_error; + if ((safeDecode) && unlikely((uptrval)(op)+length<(uptrval)op)) goto _output_error; /* overflow detection */ + } + length += MINMATCH; + +#if LZ4_FAST_DEC_LOOP + safe_match_copy: +#endif + /* match starting within external dictionary */ + if ((dict==usingExtDict) && (match < lowPrefix)) { + if (unlikely(op+length > oend-LASTLITERALS)) { + if (partialDecoding) length = MIN(length, (size_t)(oend-op)); + else goto _output_error; /* doesn't respect parsing restriction */ + } + + if (length <= (size_t)(lowPrefix-match)) { + /* match fits entirely within external dictionary : just copy */ + memmove(op, dictEnd - (lowPrefix-match), length); + op += length; + } else { + /* match stretches into both external dictionary and current block */ + size_t const copySize = (size_t)(lowPrefix - match); + size_t const restSize = length - copySize; + memcpy(op, dictEnd - copySize, copySize); + op += copySize; + if (restSize > (size_t)(op - lowPrefix)) { /* overlap copy */ + BYTE* const endOfMatch = op + restSize; + const BYTE* copyFrom = lowPrefix; + while (op < endOfMatch) *op++ = *copyFrom++; + } else { + memcpy(op, lowPrefix, restSize); + op += restSize; + } } + continue; + } + + /* copy match within block */ + cpy = op + 
length; + + /* partialDecoding : may end anywhere within the block */ + assert(op<=oend); + if (partialDecoding && (cpy > oend-MATCH_SAFEGUARD_DISTANCE)) { + size_t const mlen = MIN(length, (size_t)(oend-op)); + const BYTE* const matchEnd = match + mlen; + BYTE* const copyEnd = op + mlen; + if (matchEnd > op) { /* overlap copy */ + while (op < copyEnd) *op++ = *match++; + } else { + memcpy(op, match, mlen); + } + op = copyEnd; + if (op==oend) break; + continue; + } + + if (unlikely(offset<8)) { + op[0] = match[0]; + op[1] = match[1]; + op[2] = match[2]; + op[3] = match[3]; + match += inc32table[offset]; + memcpy(op+4, match, 4); + match -= dec64table[offset]; + } else { + memcpy(op, match, 8); + match += 8; + } + op += 8; + + if (unlikely(cpy > oend-MATCH_SAFEGUARD_DISTANCE)) { + BYTE* const oCopyLimit = oend - (WILDCOPYLENGTH-1); + if (cpy > oend-LASTLITERALS) goto _output_error; /* Error : last LASTLITERALS bytes must be literals (uncompressed) */ + if (op < oCopyLimit) { + LZ4_wildCopy8(op, match, oCopyLimit); + match += oCopyLimit - op; + op = oCopyLimit; + } + while (op < cpy) *op++ = *match++; + } else { + memcpy(op, match, 8); + if (length > 16) LZ4_wildCopy8(op+8, match+8, cpy); + } + op = cpy; /* wildcopy correction */ + } + + /* end of decoding */ + if (endOnInput) + return (int) (((char*)op)-dst); /* Nb of output bytes decoded */ + else + return (int) (((const char*)ip)-src); /* Nb of input bytes read */ + + /* Overflow error detected */ + _output_error: + return (int) (-(((const char*)ip)-src))-1; + } +} + + +/*===== Instantiate the API decoding functions. =====*/ + +LZ4_FORCE_O2_GCC_PPC64LE +int LZ4_decompress_safe(const char* source, char* dest, int compressedSize, int maxDecompressedSize) +{ + return LZ4_decompress_generic(source, dest, compressedSize, maxDecompressedSize, + endOnInputSize, decode_full_block, noDict, + (BYTE*)dest, NULL, 0); +} + +LZ4_FORCE_O2_GCC_PPC64LE +int LZ4_decompress_safe_partial(const char* src, char* dst, int compressedSize, int targetOutputSize, int dstCapacity) +{ + dstCapacity = MIN(targetOutputSize, dstCapacity); + return LZ4_decompress_generic(src, dst, compressedSize, dstCapacity, + endOnInputSize, partial_decode, + noDict, (BYTE*)dst, NULL, 0); +} + +LZ4_FORCE_O2_GCC_PPC64LE +int LZ4_decompress_fast(const char* source, char* dest, int originalSize) +{ + return LZ4_decompress_generic(source, dest, 0, originalSize, + endOnOutputSize, decode_full_block, withPrefix64k, + (BYTE*)dest - 64 KB, NULL, 0); +} + +/*===== Instantiate a few more decoding cases, used more than once. =====*/ + +LZ4_FORCE_O2_GCC_PPC64LE /* Exported, an obsolete API function. */ +int LZ4_decompress_safe_withPrefix64k(const char* source, char* dest, int compressedSize, int maxOutputSize) +{ + return LZ4_decompress_generic(source, dest, compressedSize, maxOutputSize, + endOnInputSize, decode_full_block, withPrefix64k, + (BYTE*)dest - 64 KB, NULL, 0); +} + +/* Another obsolete API function, paired with the previous one. */ +int LZ4_decompress_fast_withPrefix64k(const char* source, char* dest, int originalSize) +{ + /* LZ4_decompress_fast doesn't validate match offsets, + * and thus serves well with any prefixed dictionary. 
*/ + return LZ4_decompress_fast(source, dest, originalSize); +} + +LZ4_FORCE_O2_GCC_PPC64LE +static int LZ4_decompress_safe_withSmallPrefix(const char* source, char* dest, int compressedSize, int maxOutputSize, + size_t prefixSize) +{ + return LZ4_decompress_generic(source, dest, compressedSize, maxOutputSize, + endOnInputSize, decode_full_block, noDict, + (BYTE*)dest-prefixSize, NULL, 0); +} + +LZ4_FORCE_O2_GCC_PPC64LE +int LZ4_decompress_safe_forceExtDict(const char* source, char* dest, + int compressedSize, int maxOutputSize, + const void* dictStart, size_t dictSize) +{ + return LZ4_decompress_generic(source, dest, compressedSize, maxOutputSize, + endOnInputSize, decode_full_block, usingExtDict, + (BYTE*)dest, (const BYTE*)dictStart, dictSize); +} + +LZ4_FORCE_O2_GCC_PPC64LE +static int LZ4_decompress_fast_extDict(const char* source, char* dest, int originalSize, + const void* dictStart, size_t dictSize) +{ + return LZ4_decompress_generic(source, dest, 0, originalSize, + endOnOutputSize, decode_full_block, usingExtDict, + (BYTE*)dest, (const BYTE*)dictStart, dictSize); +} + +/* The "double dictionary" mode, for use with e.g. ring buffers: the first part + * of the dictionary is passed as prefix, and the second via dictStart + dictSize. + * These routines are used only once, in LZ4_decompress_*_continue(). + */ +LZ4_FORCE_INLINE +int LZ4_decompress_safe_doubleDict(const char* source, char* dest, int compressedSize, int maxOutputSize, + size_t prefixSize, const void* dictStart, size_t dictSize) +{ + return LZ4_decompress_generic(source, dest, compressedSize, maxOutputSize, + endOnInputSize, decode_full_block, usingExtDict, + (BYTE*)dest-prefixSize, (const BYTE*)dictStart, dictSize); +} + +LZ4_FORCE_INLINE +int LZ4_decompress_fast_doubleDict(const char* source, char* dest, int originalSize, + size_t prefixSize, const void* dictStart, size_t dictSize) +{ + return LZ4_decompress_generic(source, dest, 0, originalSize, + endOnOutputSize, decode_full_block, usingExtDict, + (BYTE*)dest-prefixSize, (const BYTE*)dictStart, dictSize); +} + +/*===== streaming decompression functions =====*/ + +LZ4_streamDecode_t* LZ4_createStreamDecode(void) +{ + LZ4_streamDecode_t* lz4s = (LZ4_streamDecode_t*) ALLOC_AND_ZERO(sizeof(LZ4_streamDecode_t)); + LZ4_STATIC_ASSERT(LZ4_STREAMDECODESIZE >= sizeof(LZ4_streamDecode_t_internal)); /* A compilation error here means LZ4_STREAMDECODESIZE is not large enough */ + return lz4s; +} + +int LZ4_freeStreamDecode (LZ4_streamDecode_t* LZ4_stream) +{ + if (LZ4_stream == NULL) return 0; /* support free on NULL */ + FREEMEM(LZ4_stream); + return 0; +} + +/*! LZ4_setStreamDecode() : + * Use this function to instruct where to find the dictionary. + * This function is not necessary if previous data is still available where it was decoded. + * Loading a size of 0 is allowed (same effect as no dictionary). + * @return : 1 if OK, 0 if error + */ +int LZ4_setStreamDecode (LZ4_streamDecode_t* LZ4_streamDecode, const char* dictionary, int dictSize) +{ + LZ4_streamDecode_t_internal* lz4sd = &LZ4_streamDecode->internal_donotuse; + lz4sd->prefixSize = (size_t) dictSize; + lz4sd->prefixEnd = (const BYTE*) dictionary + dictSize; + lz4sd->externalDict = NULL; + lz4sd->extDictSize = 0; + return 1; +} + +/*! LZ4_decoderRingBufferSize() : + * when setting a ring buffer for streaming decompression (optional scenario), + * provides the minimum size of this ring buffer + * to be compatible with any source respecting maxBlockSize condition. 
+ * Note : in a ring buffer scenario, + * blocks are presumed decompressed next to each other. + * When not enough space remains for next block (remainingSize < maxBlockSize), + * decoding resumes from beginning of ring buffer. + * @return : minimum ring buffer size, + * or 0 if there is an error (invalid maxBlockSize). + */ +int LZ4_decoderRingBufferSize(int maxBlockSize) +{ + if (maxBlockSize < 0) return 0; + if (maxBlockSize > LZ4_MAX_INPUT_SIZE) return 0; + if (maxBlockSize < 16) maxBlockSize = 16; + return LZ4_DECODER_RING_BUFFER_SIZE(maxBlockSize); +} + +/* +*_continue() : + These decoding functions allow decompression of multiple blocks in "streaming" mode. + Previously decoded blocks must still be available at the memory position where they were decoded. + If it's not possible, save the relevant part of decoded data into a safe buffer, + and indicate where it stands using LZ4_setStreamDecode() +*/ +LZ4_FORCE_O2_GCC_PPC64LE +int LZ4_decompress_safe_continue (LZ4_streamDecode_t* LZ4_streamDecode, const char* source, char* dest, int compressedSize, int maxOutputSize) +{ + LZ4_streamDecode_t_internal* lz4sd = &LZ4_streamDecode->internal_donotuse; + int result; + + if (lz4sd->prefixSize == 0) { + /* The first call, no dictionary yet. */ + assert(lz4sd->extDictSize == 0); + result = LZ4_decompress_safe(source, dest, compressedSize, maxOutputSize); + if (result <= 0) return result; + lz4sd->prefixSize = (size_t)result; + lz4sd->prefixEnd = (BYTE*)dest + result; + } else if (lz4sd->prefixEnd == (BYTE*)dest) { + /* They're rolling the current segment. */ + if (lz4sd->prefixSize >= 64 KB - 1) + result = LZ4_decompress_safe_withPrefix64k(source, dest, compressedSize, maxOutputSize); + else if (lz4sd->extDictSize == 0) + result = LZ4_decompress_safe_withSmallPrefix(source, dest, compressedSize, maxOutputSize, + lz4sd->prefixSize); + else + result = LZ4_decompress_safe_doubleDict(source, dest, compressedSize, maxOutputSize, + lz4sd->prefixSize, lz4sd->externalDict, lz4sd->extDictSize); + if (result <= 0) return result; + lz4sd->prefixSize += (size_t)result; + lz4sd->prefixEnd += result; + } else { + /* The buffer wraps around, or they're switching to another buffer. 
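+         * The data decoded on previous calls (ending at prefixEnd) is
+         * re-registered below as an external dictionary, so the new block can
+         * still reference matches inside it.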
*/ + lz4sd->extDictSize = lz4sd->prefixSize; + lz4sd->externalDict = lz4sd->prefixEnd - lz4sd->extDictSize; + result = LZ4_decompress_safe_forceExtDict(source, dest, compressedSize, maxOutputSize, + lz4sd->externalDict, lz4sd->extDictSize); + if (result <= 0) return result; + lz4sd->prefixSize = (size_t)result; + lz4sd->prefixEnd = (BYTE*)dest + result; + } + + return result; +} + +LZ4_FORCE_O2_GCC_PPC64LE +int LZ4_decompress_fast_continue (LZ4_streamDecode_t* LZ4_streamDecode, const char* source, char* dest, int originalSize) +{ + LZ4_streamDecode_t_internal* lz4sd = &LZ4_streamDecode->internal_donotuse; + int result; + assert(originalSize >= 0); + + if (lz4sd->prefixSize == 0) { + assert(lz4sd->extDictSize == 0); + result = LZ4_decompress_fast(source, dest, originalSize); + if (result <= 0) return result; + lz4sd->prefixSize = (size_t)originalSize; + lz4sd->prefixEnd = (BYTE*)dest + originalSize; + } else if (lz4sd->prefixEnd == (BYTE*)dest) { + if (lz4sd->prefixSize >= 64 KB - 1 || lz4sd->extDictSize == 0) + result = LZ4_decompress_fast(source, dest, originalSize); + else + result = LZ4_decompress_fast_doubleDict(source, dest, originalSize, + lz4sd->prefixSize, lz4sd->externalDict, lz4sd->extDictSize); + if (result <= 0) return result; + lz4sd->prefixSize += (size_t)originalSize; + lz4sd->prefixEnd += originalSize; + } else { + lz4sd->extDictSize = lz4sd->prefixSize; + lz4sd->externalDict = lz4sd->prefixEnd - lz4sd->extDictSize; + result = LZ4_decompress_fast_extDict(source, dest, originalSize, + lz4sd->externalDict, lz4sd->extDictSize); + if (result <= 0) return result; + lz4sd->prefixSize = (size_t)originalSize; + lz4sd->prefixEnd = (BYTE*)dest + originalSize; + } + + return result; +} + + +/* +Advanced decoding functions : +*_usingDict() : + These decoding functions work the same as "_continue" ones, + the dictionary must be explicitly provided within parameters +*/ + +int LZ4_decompress_safe_usingDict(const char* source, char* dest, int compressedSize, int maxOutputSize, const char* dictStart, int dictSize) +{ + if (dictSize==0) + return LZ4_decompress_safe(source, dest, compressedSize, maxOutputSize); + if (dictStart+dictSize == dest) { + if (dictSize >= 64 KB - 1) + return LZ4_decompress_safe_withPrefix64k(source, dest, compressedSize, maxOutputSize); + return LZ4_decompress_safe_withSmallPrefix(source, dest, compressedSize, maxOutputSize, dictSize); + } + return LZ4_decompress_safe_forceExtDict(source, dest, compressedSize, maxOutputSize, dictStart, dictSize); +} + +int LZ4_decompress_fast_usingDict(const char* source, char* dest, int originalSize, const char* dictStart, int dictSize) +{ + if (dictSize==0 || dictStart+dictSize == dest) + return LZ4_decompress_fast(source, dest, originalSize); + return LZ4_decompress_fast_extDict(source, dest, originalSize, dictStart, dictSize); +} + + +/*=************************************************* +* Obsolete Functions +***************************************************/ +/* obsolete compression functions */ +int LZ4_compress_limitedOutput(const char* source, char* dest, int inputSize, int maxOutputSize) +{ + return LZ4_compress_default(source, dest, inputSize, maxOutputSize); +} +int LZ4_compress(const char* source, char* dest, int inputSize) +{ + return LZ4_compress_default(source, dest, inputSize, LZ4_compressBound(inputSize)); +} +int LZ4_compress_limitedOutput_withState (void* state, const char* src, char* dst, int srcSize, int dstSize) +{ + return LZ4_compress_fast_extState(state, src, dst, srcSize, dstSize, 1); +} +int 
LZ4_compress_withState (void* state, const char* src, char* dst, int srcSize) +{ + return LZ4_compress_fast_extState(state, src, dst, srcSize, LZ4_compressBound(srcSize), 1); +} +int LZ4_compress_limitedOutput_continue (LZ4_stream_t* LZ4_stream, const char* src, char* dst, int srcSize, int dstCapacity) +{ + return LZ4_compress_fast_continue(LZ4_stream, src, dst, srcSize, dstCapacity, 1); +} +int LZ4_compress_continue (LZ4_stream_t* LZ4_stream, const char* source, char* dest, int inputSize) +{ + return LZ4_compress_fast_continue(LZ4_stream, source, dest, inputSize, LZ4_compressBound(inputSize), 1); +} + +/* +These decompression functions are deprecated and should no longer be used. +They are only provided here for compatibility with older user programs. +- LZ4_uncompress is totally equivalent to LZ4_decompress_fast +- LZ4_uncompress_unknownOutputSize is totally equivalent to LZ4_decompress_safe +*/ +int LZ4_uncompress (const char* source, char* dest, int outputSize) +{ + return LZ4_decompress_fast(source, dest, outputSize); +} +int LZ4_uncompress_unknownOutputSize (const char* source, char* dest, int isize, int maxOutputSize) +{ + return LZ4_decompress_safe(source, dest, isize, maxOutputSize); +} + +/* Obsolete Streaming functions */ + +int LZ4_sizeofStreamState() { return LZ4_STREAMSIZE; } + +int LZ4_resetStreamState(void* state, char* inputBuffer) +{ + (void)inputBuffer; + LZ4_resetStream((LZ4_stream_t*)state); + return 0; +} + +void* LZ4_create (char* inputBuffer) +{ + (void)inputBuffer; + return LZ4_createStream(); +} + +char* LZ4_slideInputBuffer (void* state) +{ + /* avoid const char * -> char * conversion warning */ + return (char *)(uptrval)((LZ4_stream_t*)state)->internal_donotuse.dictionary; +} + +#endif /* LZ4_COMMONDEFS_ONLY */ + +} diff --git a/libs/tracy/common/tracy_lz4.hpp b/libs/tracy/common/tracy_lz4.hpp @@ -0,0 +1,679 @@ +/* + * LZ4 - Fast LZ compression algorithm + * Header File + * Copyright (C) 2011-present, Yann Collet. + + BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following disclaimer + in the documentation and/or other materials provided with the + distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ + You can contact the author at : + - LZ4 homepage : http://www.lz4.org + - LZ4 source repository : https://github.com/lz4/lz4 +*/ + + +#ifndef TRACY_LZ4_H_2983827168210 +#define TRACY_LZ4_H_2983827168210 + +/* --- Dependency --- */ +#include <stddef.h> /* size_t */ +#include <stdint.h> + +namespace tracy +{ + +/** + Introduction + + LZ4 is lossless compression algorithm, providing compression speed at 500 MB/s per core, + scalable with multi-cores CPU. It features an extremely fast decoder, with speed in + multiple GB/s per core, typically reaching RAM speed limits on multi-core systems. + + The LZ4 compression library provides in-memory compression and decompression functions. + It gives full buffer control to user. + Compression can be done in: + - a single step (described as Simple Functions) + - a single step, reusing a context (described in Advanced Functions) + - unbounded multiple steps (described as Streaming compression) + + lz4.h generates and decodes LZ4-compressed blocks (doc/lz4_Block_format.md). + Decompressing a block requires additional metadata, such as its compressed size. + Each application is free to encode and pass such metadata in whichever way it wants. + + lz4.h only handle blocks, it can not generate Frames. + + Blocks are different from Frames (doc/lz4_Frame_format.md). + Frames bundle both blocks and metadata in a specified manner. + This are required for compressed data to be self-contained and portable. + Frame format is delivered through a companion API, declared in lz4frame.h. + Note that the `lz4` CLI can only manage frames. +*/ + +/*^*************************************************************** +* Export parameters +*****************************************************************/ +/* +* LZ4_DLL_EXPORT : +* Enable exporting of functions when building a Windows DLL +* LZ4LIB_VISIBILITY : +* Control library symbols visibility. +*/ +#ifndef LZ4LIB_VISIBILITY +# if defined(__GNUC__) && (__GNUC__ >= 4) +# define LZ4LIB_VISIBILITY __attribute__ ((visibility ("default"))) +# else +# define LZ4LIB_VISIBILITY +# endif +#endif +#if defined(LZ4_DLL_EXPORT) && (LZ4_DLL_EXPORT==1) +# define LZ4LIB_API __declspec(dllexport) LZ4LIB_VISIBILITY +#elif defined(LZ4_DLL_IMPORT) && (LZ4_DLL_IMPORT==1) +# define LZ4LIB_API __declspec(dllimport) LZ4LIB_VISIBILITY /* It isn't required but allows to generate better code, saving a function pointer load from the IAT and an indirect jump.*/ +#else +# define LZ4LIB_API LZ4LIB_VISIBILITY +#endif + +/*------ Version ------*/ +#define LZ4_VERSION_MAJOR 1 /* for breaking interface changes */ +#define LZ4_VERSION_MINOR 9 /* for new (non-breaking) interface capabilities */ +#define LZ4_VERSION_RELEASE 1 /* for tweaks, bug-fixes, or development */ + +#define LZ4_VERSION_NUMBER (LZ4_VERSION_MAJOR *100*100 + LZ4_VERSION_MINOR *100 + LZ4_VERSION_RELEASE) + +#define LZ4_LIB_VERSION LZ4_VERSION_MAJOR.LZ4_VERSION_MINOR.LZ4_VERSION_RELEASE +#define LZ4_QUOTE(str) #str +#define LZ4_EXPAND_AND_QUOTE(str) LZ4_QUOTE(str) +#define LZ4_VERSION_STRING LZ4_EXPAND_AND_QUOTE(LZ4_LIB_VERSION) + +LZ4LIB_API int LZ4_versionNumber (void); /**< library version number; useful to check dll version */ +LZ4LIB_API const char* LZ4_versionString (void); /**< library version string; useful to check dll version */ + + +/*-************************************ +* Tuning parameter +**************************************/ +/*! + * LZ4_MEMORY_USAGE : + * Memory usage formula : N->2^N Bytes (examples : 10 -> 1KB; 12 -> 4KB ; 16 -> 64KB; 20 -> 1MB; etc.) 
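+ * Note : in this copy the fallback value defined just below is 12 (a 4 KB
+ * table), not the 14 / 16 KB mentioned at the end of this comment.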
+ * Increasing memory usage improves compression ratio. + * Reduced memory usage may improve speed, thanks to better cache locality. + * Default value is 14, for 16KB, which nicely fits into Intel x86 L1 cache + */ +#ifndef LZ4_MEMORY_USAGE +# define LZ4_MEMORY_USAGE 12 +#endif + + +/*-************************************ +* Simple Functions +**************************************/ +/*! LZ4_compress_default() : + Compresses 'srcSize' bytes from buffer 'src' + into already allocated 'dst' buffer of size 'dstCapacity'. + Compression is guaranteed to succeed if 'dstCapacity' >= LZ4_compressBound(srcSize). + It also runs faster, so it's a recommended setting. + If the function cannot compress 'src' into a more limited 'dst' budget, + compression stops *immediately*, and the function result is zero. + In which case, 'dst' content is undefined (invalid). + srcSize : max supported value is LZ4_MAX_INPUT_SIZE. + dstCapacity : size of buffer 'dst' (which must be already allocated) + @return : the number of bytes written into buffer 'dst' (necessarily <= dstCapacity) + or 0 if compression fails + Note : This function is protected against buffer overflow scenarios (never writes outside 'dst' buffer, nor read outside 'source' buffer). +*/ +LZ4LIB_API int LZ4_compress_default(const char* src, char* dst, int srcSize, int dstCapacity); + +/*! LZ4_decompress_safe() : + compressedSize : is the exact complete size of the compressed block. + dstCapacity : is the size of destination buffer, which must be already allocated. + @return : the number of bytes decompressed into destination buffer (necessarily <= dstCapacity) + If destination buffer is not large enough, decoding will stop and output an error code (negative value). + If the source stream is detected malformed, the function will stop decoding and return a negative result. + Note : This function is protected against malicious data packets (never writes outside 'dst' buffer, nor read outside 'source' buffer). +*/ +LZ4LIB_API int LZ4_decompress_safe (const char* src, char* dst, int compressedSize, int dstCapacity); + + +/*-************************************ +* Advanced Functions +**************************************/ +#define LZ4_MAX_INPUT_SIZE 0x7E000000 /* 2 113 929 216 bytes */ +#define LZ4_COMPRESSBOUND(isize) ((unsigned)(isize) > (unsigned)LZ4_MAX_INPUT_SIZE ? 0 : (isize) + ((isize)/255) + 16) + +/*! LZ4_compressBound() : + Provides the maximum size that LZ4 compression may output in a "worst case" scenario (input data not compressible) + This function is primarily useful for memory allocation purposes (destination buffer size). + Macro LZ4_COMPRESSBOUND() is also provided for compilation-time evaluation (stack memory allocation for example). + Note that LZ4_compress_default() compresses faster when dstCapacity is >= LZ4_compressBound(srcSize) + inputSize : max supported value is LZ4_MAX_INPUT_SIZE + return : maximum output size in a "worst case" scenario + or 0, if input size is incorrect (too large or negative) +*/ +LZ4LIB_API int LZ4_compressBound(int inputSize); + +/*! LZ4_compress_fast() : + Same as LZ4_compress_default(), but allows selection of "acceleration" factor. + The larger the acceleration value, the faster the algorithm, but also the lesser the compression. + It's a trade-off. It can be fine tuned, with each successive value providing roughly +~3% to speed. + An acceleration value of "1" is the same as regular LZ4_compress_default() + Values <= 0 will be replaced by ACCELERATION_DEFAULT (currently == 1, see lz4.c). 
+*/ +LZ4LIB_API int LZ4_compress_fast (const char* src, char* dst, int srcSize, int dstCapacity, int acceleration); + + +/*! LZ4_compress_fast_extState() : + * Same as LZ4_compress_fast(), using an externally allocated memory space for its state. + * Use LZ4_sizeofState() to know how much memory must be allocated, + * and allocate it on 8-bytes boundaries (using `malloc()` typically). + * Then, provide this buffer as `void* state` to compression function. + */ +LZ4LIB_API int LZ4_sizeofState(void); +LZ4LIB_API int LZ4_compress_fast_extState (void* state, const char* src, char* dst, int srcSize, int dstCapacity, int acceleration); + + +/*! LZ4_compress_destSize() : + * Reverse the logic : compresses as much data as possible from 'src' buffer + * into already allocated buffer 'dst', of size >= 'targetDestSize'. + * This function either compresses the entire 'src' content into 'dst' if it's large enough, + * or fill 'dst' buffer completely with as much data as possible from 'src'. + * note: acceleration parameter is fixed to "default". + * + * *srcSizePtr : will be modified to indicate how many bytes where read from 'src' to fill 'dst'. + * New value is necessarily <= input value. + * @return : Nb bytes written into 'dst' (necessarily <= targetDestSize) + * or 0 if compression fails. +*/ +LZ4LIB_API int LZ4_compress_destSize (const char* src, char* dst, int* srcSizePtr, int targetDstSize); + + +/*! LZ4_decompress_safe_partial() : + * Decompress an LZ4 compressed block, of size 'srcSize' at position 'src', + * into destination buffer 'dst' of size 'dstCapacity'. + * Up to 'targetOutputSize' bytes will be decoded. + * The function stops decoding on reaching this objective, + * which can boost performance when only the beginning of a block is required. + * + * @return : the number of bytes decoded in `dst` (necessarily <= dstCapacity) + * If source stream is detected malformed, function returns a negative result. + * + * Note : @return can be < targetOutputSize, if compressed block contains less data. + * + * Note 2 : this function features 2 parameters, targetOutputSize and dstCapacity, + * and expects targetOutputSize <= dstCapacity. + * It effectively stops decoding on reaching targetOutputSize, + * so dstCapacity is kind of redundant. + * This is because in a previous version of this function, + * decoding operation would not "break" a sequence in the middle. + * As a consequence, there was no guarantee that decoding would stop at exactly targetOutputSize, + * it could write more bytes, though only up to dstCapacity. + * Some "margin" used to be required for this operation to work properly. + * This is no longer necessary. + * The function nonetheless keeps its signature, in an effort to not break API. + */ +LZ4LIB_API int LZ4_decompress_safe_partial (const char* src, char* dst, int srcSize, int targetOutputSize, int dstCapacity); + + +/*-********************************************* +* Streaming Compression Functions +***********************************************/ +typedef union LZ4_stream_u LZ4_stream_t; /* incomplete type (defined later) */ + +LZ4LIB_API LZ4_stream_t* LZ4_createStream(void); +LZ4LIB_API int LZ4_freeStream (LZ4_stream_t* streamPtr); + +/*! LZ4_resetStream_fast() : v1.9.0+ + * Use this to prepare an LZ4_stream_t for a new chain of dependent blocks + * (e.g., LZ4_compress_fast_continue()). + * + * An LZ4_stream_t must be initialized once before usage. + * This is automatically done when created by LZ4_createStream(). 
+ * However, should the LZ4_stream_t be simply declared on stack (for example), + * it's necessary to initialize it first, using LZ4_initStream(). + * + * After init, start any new stream with LZ4_resetStream_fast(). + * A same LZ4_stream_t can be re-used multiple times consecutively + * and compress multiple streams, + * provided that it starts each new stream with LZ4_resetStream_fast(). + * + * LZ4_resetStream_fast() is much faster than LZ4_initStream(), + * but is not compatible with memory regions containing garbage data. + * + * Note: it's only useful to call LZ4_resetStream_fast() + * in the context of streaming compression. + * The *extState* functions perform their own resets. + * Invoking LZ4_resetStream_fast() before is redundant, and even counterproductive. + */ +LZ4LIB_API void LZ4_resetStream_fast (LZ4_stream_t* streamPtr); + +/*! LZ4_loadDict() : + * Use this function to reference a static dictionary into LZ4_stream_t. + * The dictionary must remain available during compression. + * LZ4_loadDict() triggers a reset, so any previous data will be forgotten. + * The same dictionary will have to be loaded on decompression side for successful decoding. + * Dictionary are useful for better compression of small data (KB range). + * While LZ4 accept any input as dictionary, + * results are generally better when using Zstandard's Dictionary Builder. + * Loading a size of 0 is allowed, and is the same as reset. + * @return : loaded dictionary size, in bytes (necessarily <= 64 KB) + */ +LZ4LIB_API int LZ4_loadDict (LZ4_stream_t* streamPtr, const char* dictionary, int dictSize); + +/*! LZ4_compress_fast_continue() : + * Compress 'src' content using data from previously compressed blocks, for better compression ratio. + * 'dst' buffer must be already allocated. + * If dstCapacity >= LZ4_compressBound(srcSize), compression is guaranteed to succeed, and runs faster. + * + * @return : size of compressed block + * or 0 if there is an error (typically, cannot fit into 'dst'). + * + * Note 1 : Each invocation to LZ4_compress_fast_continue() generates a new block. + * Each block has precise boundaries. + * Each block must be decompressed separately, calling LZ4_decompress_*() with relevant metadata. + * It's not possible to append blocks together and expect a single invocation of LZ4_decompress_*() to decompress them together. + * + * Note 2 : The previous 64KB of source data is __assumed__ to remain present, unmodified, at same address in memory ! + * + * Note 3 : When input is structured as a double-buffer, each buffer can have any size, including < 64 KB. + * Make sure that buffers are separated, by at least one byte. + * This construction ensures that each block only depends on previous block. + * + * Note 4 : If input buffer is a ring-buffer, it can have any size, including < 64 KB. + * + * Note 5 : After an error, the stream status is undefined (invalid), it can only be reset or freed. + */ +LZ4LIB_API int LZ4_compress_fast_continue (LZ4_stream_t* streamPtr, const char* src, char* dst, int srcSize, int dstCapacity, int acceleration); + +/*! LZ4_saveDict() : + * If last 64KB data cannot be guaranteed to remain available at its current memory location, + * save it into a safer place (char* safeBuffer). + * This is schematically equivalent to a memcpy() followed by LZ4_loadDict(), + * but is much faster, because LZ4_saveDict() doesn't need to rebuild tables. + * @return : saved dictionary size in bytes (necessarily <= maxDictSize), or 0 if error. 
+ */ +LZ4LIB_API int LZ4_saveDict (LZ4_stream_t* streamPtr, char* safeBuffer, int maxDictSize); + + +/*-********************************************** +* Streaming Decompression Functions +* Bufferless synchronous API +************************************************/ +typedef union LZ4_streamDecode_u LZ4_streamDecode_t; /* tracking context */ + +/*! LZ4_createStreamDecode() and LZ4_freeStreamDecode() : + * creation / destruction of streaming decompression tracking context. + * A tracking context can be re-used multiple times. + */ +LZ4LIB_API LZ4_streamDecode_t* LZ4_createStreamDecode(void); +LZ4LIB_API int LZ4_freeStreamDecode (LZ4_streamDecode_t* LZ4_stream); + +/*! LZ4_setStreamDecode() : + * An LZ4_streamDecode_t context can be allocated once and re-used multiple times. + * Use this function to start decompression of a new stream of blocks. + * A dictionary can optionally be set. Use NULL or size 0 for a reset order. + * Dictionary is presumed stable : it must remain accessible and unmodified during next decompression. + * @return : 1 if OK, 0 if error + */ +LZ4LIB_API int LZ4_setStreamDecode (LZ4_streamDecode_t* LZ4_streamDecode, const char* dictionary, int dictSize); + +/*! LZ4_decoderRingBufferSize() : v1.8.2+ + * Note : in a ring buffer scenario (optional), + * blocks are presumed decompressed next to each other + * up to the moment there is not enough remaining space for next block (remainingSize < maxBlockSize), + * at which stage it resumes from beginning of ring buffer. + * When setting such a ring buffer for streaming decompression, + * provides the minimum size of this ring buffer + * to be compatible with any source respecting maxBlockSize condition. + * @return : minimum ring buffer size, + * or 0 if there is an error (invalid maxBlockSize). + */ +LZ4LIB_API int LZ4_decoderRingBufferSize(int maxBlockSize); +#define LZ4_DECODER_RING_BUFFER_SIZE(maxBlockSize) (65536 + 14 + (maxBlockSize)) /* for static allocation; maxBlockSize presumed valid */ + +/*! LZ4_decompress_*_continue() : + * These decoding functions allow decompression of consecutive blocks in "streaming" mode. + * A block is an unsplittable entity, it must be presented entirely to a decompression function. + * Decompression functions only accepts one block at a time. + * The last 64KB of previously decoded data *must* remain available and unmodified at the memory position where they were decoded. + * If less than 64KB of data has been decoded, all the data must be present. + * + * Special : if decompression side sets a ring buffer, it must respect one of the following conditions : + * - Decompression buffer size is _at least_ LZ4_decoderRingBufferSize(maxBlockSize). + * maxBlockSize is the maximum size of any single block. It can have any value > 16 bytes. + * In which case, encoding and decoding buffers do not need to be synchronized. + * Actually, data can be produced by any source compliant with LZ4 format specification, and respecting maxBlockSize. + * - Synchronized mode : + * Decompression buffer size is _exactly_ the same as compression buffer size, + * and follows exactly same update rule (block boundaries at same positions), + * and decoding function is provided with exact decompressed size of each block (exception for last block of the stream), + * _then_ decoding & encoding ring buffer can have any size, including small ones ( < 64 KB). + * - Decompression buffer is larger than encoding buffer, by a minimum of maxBlockSize more bytes. 
+ * In which case, encoding and decoding buffers do not need to be synchronized, + * and encoding ring buffer can have any size, including small ones ( < 64 KB). + * + * Whenever these conditions are not possible, + * save the last 64KB of decoded data into a safe buffer where it can't be modified during decompression, + * then indicate where this data is saved using LZ4_setStreamDecode(), before decompressing next block. +*/ +LZ4LIB_API int LZ4_decompress_safe_continue (LZ4_streamDecode_t* LZ4_streamDecode, const char* src, char* dst, int srcSize, int dstCapacity); + + +/*! LZ4_decompress_*_usingDict() : + * These decoding functions work the same as + * a combination of LZ4_setStreamDecode() followed by LZ4_decompress_*_continue() + * They are stand-alone, and don't need an LZ4_streamDecode_t structure. + * Dictionary is presumed stable : it must remain accessible and unmodified during decompression. + * Performance tip : Decompression speed can be substantially increased + * when dst == dictStart + dictSize. + */ +LZ4LIB_API int LZ4_decompress_safe_usingDict (const char* src, char* dst, int srcSize, int dstCapcity, const char* dictStart, int dictSize); + + +/*^************************************* + * !!!!!! STATIC LINKING ONLY !!!!!! + ***************************************/ + +/*-**************************************************************************** + * Experimental section + * + * Symbols declared in this section must be considered unstable. Their + * signatures or semantics may change, or they may be removed altogether in the + * future. They are therefore only safe to depend on when the caller is + * statically linked against the library. + * + * To protect against unsafe usage, not only are the declarations guarded, + * the definitions are hidden by default + * when building LZ4 as a shared/dynamic library. + * + * In order to access these declarations, + * define LZ4_STATIC_LINKING_ONLY in your application + * before including LZ4's headers. + * + * In order to make their implementations accessible dynamically, you must + * define LZ4_PUBLISH_STATIC_FUNCTIONS when building the LZ4 library. + ******************************************************************************/ + +#ifdef LZ4_PUBLISH_STATIC_FUNCTIONS +#define LZ4LIB_STATIC_API LZ4LIB_API +#else +#define LZ4LIB_STATIC_API +#endif + +#ifdef LZ4_STATIC_LINKING_ONLY + + +/*! LZ4_compress_fast_extState_fastReset() : + * A variant of LZ4_compress_fast_extState(). + * + * Using this variant avoids an expensive initialization step. + * It is only safe to call if the state buffer is known to be correctly initialized already + * (see above comment on LZ4_resetStream_fast() for a definition of "correctly initialized"). + * From a high level, the difference is that + * this function initializes the provided state with a call to something like LZ4_resetStream_fast() + * while LZ4_compress_fast_extState() starts with a call to LZ4_resetStream(). + */ +LZ4LIB_STATIC_API int LZ4_compress_fast_extState_fastReset (void* state, const char* src, char* dst, int srcSize, int dstCapacity, int acceleration); + +/*! LZ4_attach_dictionary() : + * This is an experimental API that allows + * efficient use of a static dictionary many times. 
+ * + * Rather than re-loading the dictionary buffer into a working context before + * each compression, or copying a pre-loaded dictionary's LZ4_stream_t into a + * working LZ4_stream_t, this function introduces a no-copy setup mechanism, + * in which the working stream references the dictionary stream in-place. + * + * Several assumptions are made about the state of the dictionary stream. + * Currently, only streams which have been prepared by LZ4_loadDict() should + * be expected to work. + * + * Alternatively, the provided dictionaryStream may be NULL, + * in which case any existing dictionary stream is unset. + * + * If a dictionary is provided, it replaces any pre-existing stream history. + * The dictionary contents are the only history that can be referenced and + * logically immediately precede the data compressed in the first subsequent + * compression call. + * + * The dictionary will only remain attached to the working stream through the + * first compression call, at the end of which it is cleared. The dictionary + * stream (and source buffer) must remain in-place / accessible / unchanged + * through the completion of the first compression call on the stream. + */ +LZ4LIB_STATIC_API void LZ4_attach_dictionary(LZ4_stream_t* workingStream, const LZ4_stream_t* dictionaryStream); + +#endif + + +/*-************************************************************ + * PRIVATE DEFINITIONS + ************************************************************** + * Do not use these definitions directly. + * They are only exposed to allow static allocation of `LZ4_stream_t` and `LZ4_streamDecode_t`. + * Accessing members will expose code to API and/or ABI break in future versions of the library. + **************************************************************/ +#define LZ4_HASHLOG (LZ4_MEMORY_USAGE-2) +#define LZ4_HASHTABLESIZE (1 << LZ4_MEMORY_USAGE) +#define LZ4_HASH_SIZE_U32 (1 << LZ4_HASHLOG) /* required as macro for static allocation */ + +#if defined(__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) +#include <stdint.h> + +typedef struct LZ4_stream_t_internal LZ4_stream_t_internal; +struct LZ4_stream_t_internal { + uint32_t hashTable[LZ4_HASH_SIZE_U32]; + uint32_t currentOffset; + uint16_t dirty; + uint16_t tableType; + const uint8_t* dictionary; + const LZ4_stream_t_internal* dictCtx; + uint32_t dictSize; +}; + +typedef struct { + const uint8_t* externalDict; + size_t extDictSize; + const uint8_t* prefixEnd; + size_t prefixSize; +} LZ4_streamDecode_t_internal; + +#else + +typedef struct LZ4_stream_t_internal LZ4_stream_t_internal; +struct LZ4_stream_t_internal { + unsigned int hashTable[LZ4_HASH_SIZE_U32]; + unsigned int currentOffset; + unsigned short dirty; + unsigned short tableType; + const unsigned char* dictionary; + const LZ4_stream_t_internal* dictCtx; + unsigned int dictSize; +}; + +typedef struct { + const unsigned char* externalDict; + const unsigned char* prefixEnd; + size_t extDictSize; + size_t prefixSize; +} LZ4_streamDecode_t_internal; + +#endif + +/*! LZ4_stream_t : + * information structure to track an LZ4 stream. + * LZ4_stream_t can also be created using LZ4_createStream(), which is recommended. + * The structure definition can be convenient for static allocation + * (on stack, or as part of larger structure). + * Init this structure with LZ4_initStream() before first use. + * note : only use this definition in association with static linking ! + * this definition is not API/ABI safe, and may change in a future version. 
+ */ +#define LZ4_STREAMSIZE_U64 ((1 << (LZ4_MEMORY_USAGE-3)) + 4 + ((sizeof(void*)==16) ? 4 : 0) /*AS-400*/ ) +#define LZ4_STREAMSIZE (LZ4_STREAMSIZE_U64 * sizeof(unsigned long long)) +union LZ4_stream_u { + unsigned long long table[LZ4_STREAMSIZE_U64]; + LZ4_stream_t_internal internal_donotuse; +} ; /* previously typedef'd to LZ4_stream_t */ + +/*! LZ4_initStream() : v1.9.0+ + * An LZ4_stream_t structure must be initialized at least once. + * This is automatically done when invoking LZ4_createStream(), + * but it's not when the structure is simply declared on stack (for example). + * + * Use LZ4_initStream() to properly initialize a newly declared LZ4_stream_t. + * It can also initialize any arbitrary buffer of sufficient size, + * and will @return a pointer of proper type upon initialization. + * + * Note : initialization fails if size and alignment conditions are not respected. + * In which case, the function will @return NULL. + * Note2: An LZ4_stream_t structure guarantees correct alignment and size. + * Note3: Before v1.9.0, use LZ4_resetStream() instead + */ +LZ4LIB_API LZ4_stream_t* LZ4_initStream (void* buffer, size_t size); + + +/*! LZ4_streamDecode_t : + * information structure to track an LZ4 stream during decompression. + * init this structure using LZ4_setStreamDecode() before first use. + * note : only use in association with static linking ! + * this definition is not API/ABI safe, + * and may change in a future version ! + */ +#define LZ4_STREAMDECODESIZE_U64 (4 + ((sizeof(void*)==16) ? 2 : 0) /*AS-400*/ ) +#define LZ4_STREAMDECODESIZE (LZ4_STREAMDECODESIZE_U64 * sizeof(unsigned long long)) +union LZ4_streamDecode_u { + unsigned long long table[LZ4_STREAMDECODESIZE_U64]; + LZ4_streamDecode_t_internal internal_donotuse; +} ; /* previously typedef'd to LZ4_streamDecode_t */ + + +/*-************************************ +* Obsolete Functions +**************************************/ + +/*! Deprecation warnings + * + * Deprecated functions make the compiler generate a warning when invoked. + * This is meant to invite users to update their source code. + * Should deprecation warnings be a problem, it is generally possible to disable them, + * typically with -Wno-deprecated-declarations for gcc + * or _CRT_SECURE_NO_WARNINGS in Visual. + * + * Another method is to define LZ4_DISABLE_DEPRECATE_WARNINGS + * before including the header file. 
+ */ +#ifdef LZ4_DISABLE_DEPRECATE_WARNINGS +# define LZ4_DEPRECATED(message) /* disable deprecation warnings */ +#else +# define LZ4_GCC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__) +# if defined (__cplusplus) && (__cplusplus >= 201402) /* C++14 or greater */ +# define LZ4_DEPRECATED(message) [[deprecated(message)]] +# elif (LZ4_GCC_VERSION >= 405) || defined(__clang__) +# define LZ4_DEPRECATED(message) __attribute__((deprecated(message))) +# elif (LZ4_GCC_VERSION >= 301) +# define LZ4_DEPRECATED(message) __attribute__((deprecated)) +# elif defined(_MSC_VER) +# define LZ4_DEPRECATED(message) __declspec(deprecated(message)) +# else +# pragma message("WARNING: You need to implement LZ4_DEPRECATED for this compiler") +# define LZ4_DEPRECATED(message) +# endif +#endif /* LZ4_DISABLE_DEPRECATE_WARNINGS */ + +/* Obsolete compression functions */ +LZ4_DEPRECATED("use LZ4_compress_default() instead") LZ4LIB_API int LZ4_compress (const char* source, char* dest, int sourceSize); +LZ4_DEPRECATED("use LZ4_compress_default() instead") LZ4LIB_API int LZ4_compress_limitedOutput (const char* source, char* dest, int sourceSize, int maxOutputSize); +LZ4_DEPRECATED("use LZ4_compress_fast_extState() instead") LZ4LIB_API int LZ4_compress_withState (void* state, const char* source, char* dest, int inputSize); +LZ4_DEPRECATED("use LZ4_compress_fast_extState() instead") LZ4LIB_API int LZ4_compress_limitedOutput_withState (void* state, const char* source, char* dest, int inputSize, int maxOutputSize); +LZ4_DEPRECATED("use LZ4_compress_fast_continue() instead") LZ4LIB_API int LZ4_compress_continue (LZ4_stream_t* LZ4_streamPtr, const char* source, char* dest, int inputSize); +LZ4_DEPRECATED("use LZ4_compress_fast_continue() instead") LZ4LIB_API int LZ4_compress_limitedOutput_continue (LZ4_stream_t* LZ4_streamPtr, const char* source, char* dest, int inputSize, int maxOutputSize); + +/* Obsolete decompression functions */ +LZ4_DEPRECATED("use LZ4_decompress_fast() instead") LZ4LIB_API int LZ4_uncompress (const char* source, char* dest, int outputSize); +LZ4_DEPRECATED("use LZ4_decompress_safe() instead") LZ4LIB_API int LZ4_uncompress_unknownOutputSize (const char* source, char* dest, int isize, int maxOutputSize); + +/* Obsolete streaming functions; degraded functionality; do not use! + * + * In order to perform streaming compression, these functions depended on data + * that is no longer tracked in the state. They have been preserved as well as + * possible: using them will still produce a correct output. However, they don't + * actually retain any history between compression calls. The compression ratio + * achieved will therefore be no better than compressing each chunk + * independently. + */ +LZ4_DEPRECATED("Use LZ4_createStream() instead") LZ4LIB_API void* LZ4_create (char* inputBuffer); +LZ4_DEPRECATED("Use LZ4_createStream() instead") LZ4LIB_API int LZ4_sizeofStreamState(void); +LZ4_DEPRECATED("Use LZ4_resetStream() instead") LZ4LIB_API int LZ4_resetStreamState(void* state, char* inputBuffer); +LZ4_DEPRECATED("Use LZ4_saveDict() instead") LZ4LIB_API char* LZ4_slideInputBuffer (void* state); + +/* Obsolete streaming decoding functions */ +LZ4_DEPRECATED("use LZ4_decompress_safe_usingDict() instead") LZ4LIB_API int LZ4_decompress_safe_withPrefix64k (const char* src, char* dst, int compressedSize, int maxDstSize); +LZ4_DEPRECATED("use LZ4_decompress_fast_usingDict() instead") LZ4LIB_API int LZ4_decompress_fast_withPrefix64k (const char* src, char* dst, int originalSize); + +/*! 
LZ4_decompress_fast() : **unsafe!** + * These functions used to be faster than LZ4_decompress_safe(), + * but it has changed, and they are now slower than LZ4_decompress_safe(). + * This is because LZ4_decompress_fast() doesn't know the input size, + * and therefore must progress more cautiously in the input buffer to not read beyond the end of block. + * On top of that `LZ4_decompress_fast()` is not protected vs malformed or malicious inputs, making it a security liability. + * As a consequence, LZ4_decompress_fast() is strongly discouraged, and deprecated. + * + * The last remaining LZ4_decompress_fast() specificity is that + * it can decompress a block without knowing its compressed size. + * Such functionality could be achieved in a more secure manner, + * by also providing the maximum size of input buffer, + * but it would require new prototypes, and adaptation of the implementation to this new use case. + * + * Parameters: + * originalSize : is the uncompressed size to regenerate. + * `dst` must be already allocated, its size must be >= 'originalSize' bytes. + * @return : number of bytes read from source buffer (== compressed size). + * The function expects to finish at block's end exactly. + * If the source stream is detected malformed, the function stops decoding and returns a negative result. + * note : LZ4_decompress_fast*() requires originalSize. Thanks to this information, it never writes past the output buffer. + * However, since it doesn't know its 'src' size, it may read an unknown amount of input, past input buffer bounds. + * Also, since match offsets are not validated, match reads from 'src' may underflow too. + * These issues never happen if input (compressed) data is correct. + * But they may happen if input data is invalid (error or intentional tampering). + * As a consequence, use these functions in trusted environments with trusted data **only**. + */ + +LZ4_DEPRECATED("This function is deprecated and unsafe. Consider using LZ4_decompress_safe() instead") +LZ4LIB_API int LZ4_decompress_fast (const char* src, char* dst, int originalSize); +LZ4_DEPRECATED("This function is deprecated and unsafe. Consider using LZ4_decompress_safe_continue() instead") +LZ4LIB_API int LZ4_decompress_fast_continue (LZ4_streamDecode_t* LZ4_streamDecode, const char* src, char* dst, int originalSize); +LZ4_DEPRECATED("This function is deprecated and unsafe. Consider using LZ4_decompress_safe_usingDict() instead") +LZ4LIB_API int LZ4_decompress_fast_usingDict (const char* src, char* dst, int originalSize, const char* dictStart, int dictSize); + +/*! LZ4_resetStream() : + * An LZ4_stream_t structure must be initialized at least once. + * This is done with LZ4_initStream(), or LZ4_resetStream(). + * Consider switching to LZ4_initStream(), + * invoking LZ4_resetStream() will trigger deprecation warnings in the future. + */ +LZ4LIB_API void LZ4_resetStream (LZ4_stream_t* streamPtr); + +} + +#endif /* LZ4_H_2983827168210 */ diff --git a/libs/tracy/common/tracy_sema.h b/libs/tracy/common/tracy_sema.h @@ -0,0 +1,255 @@ +// Copyright (c) 2015 Jeff Preshing +// +// This software is provided 'as-is', without any express or implied +// warranty. In no event will the authors be held liable for any damages +// arising from the use of this software. +// +// Permission is granted to anyone to use this software for any purpose, +// including commercial applications, and to alter it and redistribute it +// freely, subject to the following restrictions: +// +// 1. 
The origin of this software must not be misrepresented; you must not +// claim that you wrote the original software. If you use this software +// in a product, an acknowledgement in the product documentation would be +// appreciated but is not required. +// 2. Altered source versions must be plainly marked as such, and must not be +// misrepresented as being the original software. +// 3. This notice may not be removed or altered from any source distribution. + +#ifndef __TRACY_CPP11OM_SEMAPHORE_H__ +#define __TRACY_CPP11OM_SEMAPHORE_H__ + +#include <atomic> +#include <cassert> + +#if defined(__MACH__) + #include <mach/mach.h> +#elif defined(__unix__) + #include <semaphore.h> +#endif + +namespace tracy +{ + +#if defined(_WIN32) +//--------------------------------------------------------- +// Semaphore (Windows) +//--------------------------------------------------------- +#ifndef MAXLONG +enum { MAXLONG = 0x7fffffff }; +#endif + +#ifndef INFINITE +enum { INFINITE = 0xFFFFFFFF }; +#endif + +#ifndef _WINDOWS_ +typedef void* HANDLE; + +extern "C" __declspec(dllimport) HANDLE __stdcall CreateSemaphoreA( void*, long, long, const char* ); +extern "C" __declspec(dllimport) int __stdcall CloseHandle( HANDLE ); +extern "C" __declspec(dllimport) unsigned long __stdcall WaitForSingleObject( HANDLE, unsigned long ); +extern "C" __declspec(dllimport) int __stdcall ReleaseSemaphore( HANDLE, long, long* ); +#endif + +class Semaphore +{ +private: + HANDLE m_hSema; + + Semaphore(const Semaphore& other) = delete; + Semaphore& operator=(const Semaphore& other) = delete; + +public: + Semaphore(int initialCount = 0) + { + assert(initialCount >= 0); + m_hSema = CreateSemaphoreA(NULL, initialCount, MAXLONG, NULL); + } + + ~Semaphore() + { + CloseHandle(m_hSema); + } + + void wait() + { + WaitForSingleObject(m_hSema, INFINITE); + } + + void signal(int count = 1) + { + ReleaseSemaphore(m_hSema, count, NULL); + } +}; + + +#elif defined(__MACH__) +//--------------------------------------------------------- +// Semaphore (Apple iOS and OSX) +// Can't use POSIX semaphores due to http://lists.apple.com/archives/darwin-kernel/2009/Apr/msg00010.html +//--------------------------------------------------------- + +class Semaphore +{ +private: + semaphore_t m_sema; + + Semaphore(const Semaphore& other) = delete; + Semaphore& operator=(const Semaphore& other) = delete; + +public: + Semaphore(int initialCount = 0) + { + assert(initialCount >= 0); + semaphore_create(mach_task_self(), &m_sema, SYNC_POLICY_FIFO, initialCount); + } + + ~Semaphore() + { + semaphore_destroy(mach_task_self(), m_sema); + } + + void wait() + { + semaphore_wait(m_sema); + } + + void signal() + { + semaphore_signal(m_sema); + } + + void signal(int count) + { + while (count-- > 0) + { + semaphore_signal(m_sema); + } + } +}; + + +#elif defined(__unix__) +//--------------------------------------------------------- +// Semaphore (POSIX, Linux) +//--------------------------------------------------------- + +class Semaphore +{ +private: + sem_t m_sema; + + Semaphore(const Semaphore& other) = delete; + Semaphore& operator=(const Semaphore& other) = delete; + +public: + Semaphore(int initialCount = 0) + { + assert(initialCount >= 0); + sem_init(&m_sema, 0, initialCount); + } + + ~Semaphore() + { + sem_destroy(&m_sema); + } + + void wait() + { + // http://stackoverflow.com/questions/2013181/gdb-causes-sem-wait-to-fail-with-eintr-error + int rc; + do + { + rc = sem_wait(&m_sema); + } + while (rc == -1 && errno == EINTR); + } + + void signal() + { + 
sem_post(&m_sema); + } + + void signal(int count) + { + while (count-- > 0) + { + sem_post(&m_sema); + } + } +}; + + +#else + +#error Unsupported platform! + +#endif + + +//--------------------------------------------------------- +// LightweightSemaphore +//--------------------------------------------------------- +class LightweightSemaphore +{ +private: + std::atomic<int> m_count; + Semaphore m_sema; + + void waitWithPartialSpinning() + { + int oldCount; + // Is there a better way to set the initial spin count? + // If we lower it to 1000, testBenaphore becomes 15x slower on my Core i7-5930K Windows PC, + // as threads start hitting the kernel semaphore. + int spin = 10000; + while (spin--) + { + oldCount = m_count.load(std::memory_order_relaxed); + if ((oldCount > 0) && m_count.compare_exchange_strong(oldCount, oldCount - 1, std::memory_order_acquire)) + return; + std::atomic_signal_fence(std::memory_order_acquire); // Prevent the compiler from collapsing the loop. + } + oldCount = m_count.fetch_sub(1, std::memory_order_acquire); + if (oldCount <= 0) + { + m_sema.wait(); + } + } + +public: + LightweightSemaphore(int initialCount = 0) : m_count(initialCount) + { + assert(initialCount >= 0); + } + + bool tryWait() + { + int oldCount = m_count.load(std::memory_order_relaxed); + return (oldCount > 0 && m_count.compare_exchange_strong(oldCount, oldCount - 1, std::memory_order_acquire)); + } + + void wait() + { + if (!tryWait()) + waitWithPartialSpinning(); + } + + void signal(int count = 1) + { + int oldCount = m_count.fetch_add(count, std::memory_order_release); + int toRelease = -oldCount < count ? -oldCount : count; + if (toRelease > 0) + { + m_sema.signal(toRelease); + } + } +}; + + +typedef LightweightSemaphore DefaultSemaphoreType; + +} + +#endif // __CPP11OM_SEMAPHORE_H__ diff --git a/make.lua b/make.lua @@ -1,5 +1,7 @@ require( "ggbuild.gen_ninja" ) +require( "libs.tracy" ) + obj_cxxflags( ".*", "-I source -I libs" ) msvc_obj_cxxflags( ".*", "/W4 /wd4100 /wd4146 /wd4189 /wd4201 /wd4307 /wd4324 /wd4351 /wd4127 /wd4505 /wd4530 /wd4702 /wd4706 /D_CRT_SECURE_NO_WARNINGS" ) @@ -11,6 +13,10 @@ gcc_obj_cxxflags( ".*", "-Wall -Wextra -Wcast-align -Wvla -Wformat-security" ) - gcc_obj_cxxflags( ".*", "-Wno-unused-parameter -Wno-missing-field-initializers -Wno-implicit-fallthrough -Wno-format-truncation" ) gcc_obj_cxxflags( ".*", "-Werror=vla -Werror=format-security -Werror=unused-value" ) +if config ~= "release" then + obj_cxxflags( ".*", "-DTRACY_ENABLE" ) +end + local platform_srcs, platform_libs if OS == "windows" then @@ -31,7 +37,7 @@ bin( "mudgangster", { "src/ui.cc", "src/script.cc", "src/textbox.cc", "src/input.cc", "src/platform_network.cc", }, - libs = platform_libs, + libs = { platform_libs, "tracy" }, rc = "src/rc", diff --git a/src/common.h b/src/common.h @@ -9,6 +9,8 @@ #include "config.h" +#include "tracy/Tracy.hpp" + typedef uint8_t u8; typedef uint16_t u16; typedef uint32_t u32;
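
Editor's note on the block API vendored above: the one-shot functions are the simplest entry point. The sketch below is illustrative only and is not part of this commit; it assumes the vendored header is reachable as "tracy/common/tracy_lz4.hpp" (make.lua already passes -I libs) and that the matching implementation file, whose tail is visible at the top of this diff, is compiled and linked. In this copy every declaration lives in namespace tracy.

#include <stdio.h>
#include <stdlib.h>

#include "tracy/common/tracy_lz4.hpp"

int main()
{
    const char src[] = "an example payload, an example payload, an example payload";
    const int srcSize = (int)sizeof( src );

    // Worst-case output size; compression into a buffer this large cannot fail.
    const int bound = tracy::LZ4_compressBound( srcSize );
    char* compressed = (char*)malloc( bound );
    const int compressedSize = tracy::LZ4_compress_default( src, compressed, srcSize, bound );
    if( compressedSize <= 0 ) return 1;          // 0 means the data did not fit in 'dst'

    // Decompression needs the exact compressed size and a large-enough destination.
    char* roundtrip = (char*)malloc( srcSize );
    const int decompressedSize = tracy::LZ4_decompress_safe( compressed, roundtrip, compressedSize, srcSize );
    if( decompressedSize < 0 ) return 1;          // negative return means malformed input

    printf( "%d -> %d -> %d bytes\n", srcSize, compressedSize, decompressedSize );
    free( compressed );
    free( roundtrip );
    return 0;
}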
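
The streaming notes above (LZ4_compress_fast_continue(), Notes 1-5) condense into the following sketch. Again not part of the commit: the chunk size and error policy are placeholders; the load-bearing points are that each call emits one block, each block's compressed size must be shipped as out-of-band metadata, and the previously compressed input must stay readable at the same address.

#include <stdio.h>

#include "tracy/common/tracy_lz4.hpp"

// Illustrative only: compress a buffer as a chain of dependent blocks.
// The consumer must decompress them in order (e.g. with
// LZ4_decompress_safe_continue()), one call per block.
void CompressInChunks( const char* data, int totalSize )
{
    const int kChunk = 16 * 1024;                 // any size <= 64 KB behaves the same way
    char dst[ LZ4_COMPRESSBOUND( 16 * 1024 ) ];   // worst case for one chunk

    tracy::LZ4_stream_t* stream = tracy::LZ4_createStream();
    for( int offset = 0; offset < totalSize; offset += kChunk )
    {
        const int srcSize = totalSize - offset < kChunk ? totalSize - offset : kChunk;
        // The slice lives inside 'data', so the previous 64 KB of input stays
        // valid at the same address, as Note 2 above requires.
        const int written = tracy::LZ4_compress_fast_continue( stream, data + offset, dst, srcSize, (int)sizeof( dst ), 1 );
        if( written <= 0 ) break;                 // after an error the stream may only be reset or freed
        printf( "block: %d -> %d bytes\n", srcSize, written );
    }
    tracy::LZ4_freeStream( stream );
}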
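
Finally, the point of the commit: with make.lua defining TRACY_ENABLE for non-release configs and src/common.h now including tracy/Tracy.hpp, Tracy's zone macros are available throughout the client; in release builds they expand to nothing. The snippet below is a hypothetical illustration, not code from this diff — the function and its body are invented, only the ZoneScoped/ZoneText macros come from the vendored Tracy.hpp.

#include <stddef.h>

#include "common.h"   // now pulls in tracy/Tracy.hpp (see the src/common.h hunk above)

// Hypothetical instrumentation target; mudgangster's real functions are not part of this diff.
static void processServerLine( const char* line, size_t len )
{
    ZoneScoped;              // opens a zone named after the enclosing function, closed on scope exit
    ZoneText( line, len );   // attaches the text being processed to the zone in the profiler UI
    // ... run triggers, apply highlighting, append to the output textbox ...
}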