mudgangster

Tiny, scriptable MUD client

commit 33577cc2ae781b48b73a27381c2c5cef48ceac62
parent 7ee87faf62aa5ae0407ec328c8ba33a82aca6299
Author: Michael Savage <mikejsavage@gmail.com>
Date:   Sun,  3 May 2020 01:03:43 +0300

Add tracy

Diffstat:
M ggbuild/gen_ninja.lua | 2 +-
A libs/tracy.lua | 3 +++
A libs/tracy/LICENSE | 27 +++++++++++++++++++++++++++
A libs/tracy/Tracy.hpp | 177 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A libs/tracy/TracyC.h | 188 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A libs/tracy/TracyClient.cpp | 43 +++++++++++++++++++++++++++++++++++++++++++
A libs/tracy/TracyOpenGL.hpp | 272 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A libs/tracy/client/TracyArmCpuTable.hpp | 319 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A libs/tracy/client/TracyCallstack.cpp | 603 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A libs/tracy/client/TracyCallstack.h | 29 +++++++++++++++++++++++++++++
A libs/tracy/client/TracyCallstack.hpp | 112 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A libs/tracy/client/TracyDxt1.cpp | 646 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A libs/tracy/client/TracyDxt1.hpp | 11 +++++++++++
A libs/tracy/client/TracyFastVector.hpp | 116 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A libs/tracy/client/TracyLock.hpp | 527 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A libs/tracy/client/TracyProfiler.cpp | 2729 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A libs/tracy/client/TracyProfiler.hpp | 620 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A libs/tracy/client/TracyScoped.hpp | 119 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A libs/tracy/client/TracySysTime.cpp | 105 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A libs/tracy/client/TracySysTime.hpp | 36 ++++++++++++++++++++++++++++++++++++
A libs/tracy/client/TracySysTrace.cpp | 862 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A libs/tracy/client/TracySysTrace.hpp | 25 +++++++++++++++++++++++++
A libs/tracy/client/TracySysTracePayload.hpp | 80 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A libs/tracy/client/TracyThread.hpp | 70 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A libs/tracy/client/tracy_concurrentqueue.h | 1552 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A libs/tracy/client/tracy_rpmalloc.cpp | 2099 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A libs/tracy/client/tracy_rpmalloc.hpp | 153 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A libs/tracy/common/TracyAlign.hpp | 27 +++++++++++++++++++++++++++
A libs/tracy/common/TracyAlloc.hpp | 33 +++++++++++++++++++++++++++++++++
A libs/tracy/common/TracyApi.h | 14 ++++++++++++++
A libs/tracy/common/TracyColor.hpp | 690 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A libs/tracy/common/TracyForceInline.hpp | 20 ++++++++++++++++++++
A libs/tracy/common/TracyMutex.hpp | 33 +++++++++++++++++++++++++++++++++
A libs/tracy/common/TracyProtocol.hpp | 103 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A libs/tracy/common/TracyQueue.hpp | 500 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A libs/tracy/common/TracySocket.cpp | 561 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A libs/tracy/common/TracySocket.hpp | 131 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A libs/tracy/common/TracySystem.cpp | 187 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A libs/tracy/common/TracySystem.hpp | 80 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A libs/tracy/common/tracy_benaphore.h | 68 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A libs/tracy/common/tracy_lz4.cpp | 2297 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A libs/tracy/common/tracy_lz4.hpp | 679 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A libs/tracy/common/tracy_sema.h | 255 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
M make.lua | 8 +++++++-
M src/common.h | 2 ++
45 files changed, 17211 insertions(+), 2 deletions(-)
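
Almost all of the diff below vendors the Tracy client sources into libs/tracy/ and hooks them into the ninja build; make.lua and src/common.h each pick up a small change whose hunks fall outside this excerpt. For orientation, the zone macros declared by the new Tracy.hpp are used roughly as in the following sketch. It is hypothetical, not code from this commit: the function names are invented, and with TRACY_ENABLE left undefined every macro expands to nothing, so instrumented and plain builds share the same source.

// Hypothetical instrumentation sketch using the API vendored by this commit.
// Not part of the diff; the functions below are invented for illustration.
#include <stddef.h>
#include "libs/tracy/Tracy.hpp"

static void handle_line( const char * line, size_t len ) {
	ZoneScoped;              // times this scope, named after the enclosing function
	ZoneText( line, len );   // attaches the received line to the zone
	// ... parse ANSI codes, run triggers, render ...
}

static void main_loop_iteration() {
	ZoneScopedN( "main loop" );   // explicitly named zone
	// ... poll the socket, repaint the UI ...
	FrameMark;                    // tells the profiler a frame has ended
}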

diff --git a/ggbuild/gen_ninja.lua b/ggbuild/gen_ninja.lua
@@ -318,7 +318,7 @@ end
 
 local function rule_for_src( src_name )
 	local ext = src_name:match( "([^%.]+)$" )
-	return ( { cc = "cpp" } )[ ext ]
+	return ( { cc = "cpp", cpp = "cpp" } )[ ext ]
 end
 
 local function write_ninja_script()
diff --git a/libs/tracy.lua b/libs/tracy.lua
@@ -0,0 +1,3 @@
+lib( "tracy", { "libs/tracy/TracyClient.cpp" } )
+msvc_obj_cxxflags( "libs/tracy/TracyClient.cpp", "/O2" )
+gcc_obj_cxxflags( "libs/tracy/TracyClient.cpp", "-O2 -Wno-unused-function -Wno-maybe-uninitialized" )
diff --git a/libs/tracy/LICENSE b/libs/tracy/LICENSE
@@ -0,0 +1,27 @@
+Tracy Profiler (https://bitbucket.org/wolfpld/tracy) is licensed under the
+3-clause BSD license.
+
+Copyright (c) 2017-2019, Bartosz Taudul <wolf.pld@gmail.com>
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    * Neither the name of the <organization> nor the
+      names of its contributors may be used to endorse or promote products
+      derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL <COPYRIGHT HOLDER> BE LIABLE FOR ANY
+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
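
Besides zones, the Tracy.hpp added below also declares TracyAlloc and TracyFree for memory profiling. A common way to feed them is from replacement global operator new/delete; the sketch below is purely illustrative, is not something this commit adds, and both macros compile away when TRACY_ENABLE is not defined.

// Illustrative sketch only, not part of this commit: route global
// allocations through Tracy's memory events.
#include <cstdlib>
#include <new>
#include "libs/tracy/Tracy.hpp"

void * operator new( std::size_t size ) {
	void * ptr = std::malloc( size );
	if( ptr == nullptr )
		throw std::bad_alloc();
	TracyAlloc( ptr, size );   // records the allocation and its size
	return ptr;
}

void operator delete( void * ptr ) noexcept {
	TracyFree( ptr );          // records the matching free
	std::free( ptr );
}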
diff --git a/libs/tracy/Tracy.hpp b/libs/tracy/Tracy.hpp @@ -0,0 +1,177 @@ +#ifndef __TRACY_HPP__ +#define __TRACY_HPP__ + +#include "common/TracyColor.hpp" +#include "common/TracySystem.hpp" + +#ifndef TRACY_ENABLE + +#define ZoneNamed(x,y) +#define ZoneNamedN(x,y,z) +#define ZoneNamedC(x,y,z) +#define ZoneNamedNC(x,y,z,w) + +#define ZoneScoped +#define ZoneScopedN(x) +#define ZoneScopedC(x) +#define ZoneScopedNC(x,y) + +#define ZoneText(x,y) +#define ZoneName(x,y) + +#define FrameMark +#define FrameMarkNamed(x) +#define FrameMarkStart(x) +#define FrameMarkEnd(x) + +#define FrameImage(x,y,z,w,a) + +#define TracyLockable( type, varname ) type varname; +#define TracyLockableN( type, varname, desc ) type varname; +#define TracySharedLockable( type, varname ) type varname; +#define TracySharedLockableN( type, varname, desc ) type varname; +#define LockableBase( type ) type +#define SharedLockableBase( type ) type +#define LockMark(x) (void)x; + +#define TracyPlot(x,y) +#define TracyPlotConfig(x,y) + +#define TracyMessage(x,y) +#define TracyMessageL(x) +#define TracyMessageC(x,y,z) +#define TracyMessageLC(x,y) +#define TracyAppInfo(x,y) + +#define TracyAlloc(x,y) +#define TracyFree(x) + +#define ZoneNamedS(x,y,z) +#define ZoneNamedNS(x,y,z,w) +#define ZoneNamedCS(x,y,z,w) +#define ZoneNamedNCS(x,y,z,w,a) + +#define ZoneScopedS(x) +#define ZoneScopedNS(x,y) +#define ZoneScopedCS(x,y) +#define ZoneScopedNCS(x,y,z) + +#define TracyAllocS(x,y,z) +#define TracyFreeS(x,y) + +#define TracyMessageS(x,y,z) +#define TracyMessageLS(x,y) +#define TracyMessageCS(x,y,z,w) +#define TracyMessageLCS(x,y,z) + +#define TracyParameterRegister(x) +#define TracyParameterSetup(x,y,z,w) + +#else + +#include "client/TracyLock.hpp" +#include "client/TracyProfiler.hpp" +#include "client/TracyScoped.hpp" + +#if defined TRACY_HAS_CALLSTACK && defined TRACY_CALLSTACK +# define ZoneNamed( varname, active ) static const tracy::SourceLocationData TracyConcat(__tracy_source_location,__LINE__) { nullptr, __FUNCTION__, __FILE__, (uint32_t)__LINE__, 0 }; tracy::ScopedZone varname( &TracyConcat(__tracy_source_location,__LINE__), TRACY_CALLSTACK, active ); +# define ZoneNamedN( varname, name, active ) static const tracy::SourceLocationData TracyConcat(__tracy_source_location,__LINE__) { name, __FUNCTION__, __FILE__, (uint32_t)__LINE__, 0 }; tracy::ScopedZone varname( &TracyConcat(__tracy_source_location,__LINE__), TRACY_CALLSTACK, active ); +# define ZoneNamedC( varname, color, active ) static const tracy::SourceLocationData TracyConcat(__tracy_source_location,__LINE__) { nullptr, __FUNCTION__, __FILE__, (uint32_t)__LINE__, color }; tracy::ScopedZone varname( &TracyConcat(__tracy_source_location,__LINE__), TRACY_CALLSTACK, active ); +# define ZoneNamedNC( varname, name, color, active ) static const tracy::SourceLocationData TracyConcat(__tracy_source_location,__LINE__) { name, __FUNCTION__, __FILE__, (uint32_t)__LINE__, color }; tracy::ScopedZone varname( &TracyConcat(__tracy_source_location,__LINE__), TRACY_CALLSTACK, active ); +#else +# define ZoneNamed( varname, active ) static const tracy::SourceLocationData TracyConcat(__tracy_source_location,__LINE__) { nullptr, __FUNCTION__, __FILE__, (uint32_t)__LINE__, 0 }; tracy::ScopedZone varname( &TracyConcat(__tracy_source_location,__LINE__), active ); +# define ZoneNamedN( varname, name, active ) static const tracy::SourceLocationData TracyConcat(__tracy_source_location,__LINE__) { name, __FUNCTION__, __FILE__, (uint32_t)__LINE__, 0 }; tracy::ScopedZone varname( 
&TracyConcat(__tracy_source_location,__LINE__), active ); +# define ZoneNamedC( varname, color, active ) static const tracy::SourceLocationData TracyConcat(__tracy_source_location,__LINE__) { nullptr, __FUNCTION__, __FILE__, (uint32_t)__LINE__, color }; tracy::ScopedZone varname( &TracyConcat(__tracy_source_location,__LINE__), active ); +# define ZoneNamedNC( varname, name, color, active ) static const tracy::SourceLocationData TracyConcat(__tracy_source_location,__LINE__) { name, __FUNCTION__, __FILE__, (uint32_t)__LINE__, color }; tracy::ScopedZone varname( &TracyConcat(__tracy_source_location,__LINE__), active ); +#endif + +#define ZoneScoped ZoneNamed( ___tracy_scoped_zone, true ) +#define ZoneScopedN( name ) ZoneNamedN( ___tracy_scoped_zone, name, true ) +#define ZoneScopedC( color ) ZoneNamedC( ___tracy_scoped_zone, color, true ) +#define ZoneScopedNC( name, color ) ZoneNamedNC( ___tracy_scoped_zone, name, color, true ) + +#define ZoneText( txt, size ) ___tracy_scoped_zone.Text( txt, size ); +#define ZoneName( txt, size ) ___tracy_scoped_zone.Name( txt, size ); + +#define FrameMark tracy::Profiler::SendFrameMark( nullptr ); +#define FrameMarkNamed( name ) tracy::Profiler::SendFrameMark( name ); +#define FrameMarkStart( name ) tracy::Profiler::SendFrameMark( name, tracy::QueueType::FrameMarkMsgStart ); +#define FrameMarkEnd( name ) tracy::Profiler::SendFrameMark( name, tracy::QueueType::FrameMarkMsgEnd ); + +#define FrameImage( image, width, height, offset, flip ) tracy::Profiler::SendFrameImage( image, width, height, offset, flip ); + +#define TracyLockable( type, varname ) tracy::Lockable<type> varname { [] () -> const tracy::SourceLocationData* { static const tracy::SourceLocationData srcloc { nullptr, #type " " #varname, __FILE__, __LINE__, 0 }; return &srcloc; }() }; +#define TracyLockableN( type, varname, desc ) tracy::Lockable<type> varname { [] () -> const tracy::SourceLocationData* { static const tracy::SourceLocationData srcloc { nullptr, desc, __FILE__, __LINE__, 0 }; return &srcloc; }() }; +#define TracySharedLockable( type, varname ) tracy::SharedLockable<type> varname { [] () -> const tracy::SourceLocationData* { static const tracy::SourceLocationData srcloc { nullptr, #type " " #varname, __FILE__, __LINE__, 0 }; return &srcloc; }() }; +#define TracySharedLockableN( type, varname, desc ) tracy::SharedLockable<type> varname { [] () -> const tracy::SourceLocationData* { static const tracy::SourceLocationData srcloc { nullptr, desc, __FILE__, __LINE__, 0 }; return &srcloc; }() }; +#define LockableBase( type ) tracy::Lockable<type> +#define SharedLockableBase( type ) tracy::SharedLockable<type> +#define LockMark( varname ) static const tracy::SourceLocationData __tracy_lock_location_##varname { nullptr, __FUNCTION__, __FILE__, (uint32_t)__LINE__, 0 }; varname.Mark( &__tracy_lock_location_##varname ); + +#define TracyPlot( name, val ) tracy::Profiler::PlotData( name, val ); +#define TracyPlotConfig( name, type ) tracy::Profiler::ConfigurePlot( name, type ); + +#define TracyAppInfo( txt, size ) tracy::Profiler::MessageAppInfo( txt, size ); + +#if defined TRACY_HAS_CALLSTACK && defined TRACY_CALLSTACK +# define TracyMessage( txt, size ) tracy::Profiler::Message( txt, size, TRACY_CALLSTACK ); +# define TracyMessageL( txt ) tracy::Profiler::Message( txt, TRACY_CALLSTACK ); +# define TracyMessageC( txt, size, color ) tracy::Profiler::MessageColor( txt, size, color, TRACY_CALLSTACK ); +# define TracyMessageLC( txt, color ) tracy::Profiler::MessageColor( txt, color, TRACY_CALLSTACK 
); + +# define TracyAlloc( ptr, size ) tracy::Profiler::MemAllocCallstack( ptr, size, TRACY_CALLSTACK ); +# define TracyFree( ptr ) tracy::Profiler::MemFreeCallstack( ptr, TRACY_CALLSTACK ); +#else +# define TracyMessage( txt, size ) tracy::Profiler::Message( txt, size, 0 ); +# define TracyMessageL( txt ) tracy::Profiler::Message( txt, 0 ); +# define TracyMessageC( txt, size, color ) tracy::Profiler::MessageColor( txt, size, color, 0 ); +# define TracyMessageLC( txt, color ) tracy::Profiler::MessageColor( txt, color, 0 ); + +# define TracyAlloc( ptr, size ) tracy::Profiler::MemAlloc( ptr, size ); +# define TracyFree( ptr ) tracy::Profiler::MemFree( ptr ); +#endif + +#ifdef TRACY_HAS_CALLSTACK +# define ZoneNamedS( varname, depth, active ) static const tracy::SourceLocationData TracyConcat(__tracy_source_location,__LINE__) { nullptr, __FUNCTION__, __FILE__, (uint32_t)__LINE__, 0 }; tracy::ScopedZone varname( &TracyConcat(__tracy_source_location,__LINE__), depth, active ); +# define ZoneNamedNS( varname, name, depth, active ) static const tracy::SourceLocationData TracyConcat(__tracy_source_location,__LINE__) { name, __FUNCTION__, __FILE__, (uint32_t)__LINE__, 0 }; tracy::ScopedZone varname( &TracyConcat(__tracy_source_location,__LINE__), depth, active ); +# define ZoneNamedCS( varname, color, depth, active ) static const tracy::SourceLocationData TracyConcat(__tracy_source_location,__LINE__) { nullptr, __FUNCTION__, __FILE__, (uint32_t)__LINE__, color }; tracy::ScopedZone varname( &TracyConcat(__tracy_source_location,__LINE__), depth, active ); +# define ZoneNamedNCS( varname, name, color, depth, active ) static const tracy::SourceLocationData TracyConcat(__tracy_source_location,__LINE__) { name, __FUNCTION__, __FILE__, (uint32_t)__LINE__, color }; tracy::ScopedZone varname( &TracyConcat(__tracy_source_location,__LINE__), depth, active ); + +# define ZoneScopedS( depth ) ZoneNamedS( ___tracy_scoped_zone, depth, true ) +# define ZoneScopedNS( name, depth ) ZoneNamedNS( ___tracy_scoped_zone, name, depth, true ) +# define ZoneScopedCS( color, depth ) ZoneNamedCS( ___tracy_scoped_zone, color, depth, true ) +# define ZoneScopedNCS( name, color, depth ) ZoneNamedNCS( ___tracy_scoped_zone, name, color depth, true ) + +# define TracyAllocS( ptr, size, depth ) tracy::Profiler::MemAllocCallstack( ptr, size, depth ); +# define TracyFreeS( ptr, depth ) tracy::Profiler::MemFreeCallstack( ptr, depth ); + +# define TracyMessageS( txt, size, depth ) tracy::Profiler::Message( txt, size, depth ); +# define TracyMessageLS( txt, depth ) tracy::Profiler::Message( txt, depth ); +# define TracyMessageCS( txt, size, color, depth ) tracy::Profiler::MessageColor( txt, size, color, depth ); +# define TracyMessageLCS( txt, color, depth ) tracy::Profiler::MessageColor( txt, color, depth ); +#else +# define ZoneNamedS( varname, depth, active ) ZoneNamed( varname, active ) +# define ZoneNamedNS( varname, name, depth, active ) ZoneNamedN( varname, name, active ) +# define ZoneNamedCS( varname, color, depth, active ) ZoneNamedC( varname, color, active ) +# define ZoneNamedNCS( varname, name, color, depth, active ) ZoneNamedNC( varname, name, color, active ) + +# define ZoneScopedS( depth ) ZoneScoped +# define ZoneScopedNS( name, depth ) ZoneScopedN( name ) +# define ZoneScopedCS( color, depth ) ZoneScopedC( color ) +# define ZoneScopedNCS( name, color, depth ) ZoneScopedNC( name, color ) + +# define TracyAllocS( ptr, size, depth ) TracyAlloc( ptr, size ) +# define TracyFreeS( ptr, depth ) TracyFree( ptr ) + +# define 
TracyMessageS( txt, size, depth ) TracyMessage( txt, size ) +# define TracyMessageLS( txt, depth ) TracyMessageL( txt ) +# define TracyMessageCS( txt, size, color, depth ) TracyMessageC( txt, size, color ) +# define TracyMessageLCS( txt, color, depth ) TracyMessageLC( txt, color ) +#endif + +#define TracyParameterRegister( cb ) tracy::Profiler::ParameterRegister( cb ); +#define TracyParameterSetup( idx, name, isBool, val ) tracy::Profiler::ParameterSetup( idx, name, isBool, val ); + +#endif + +#endif diff --git a/libs/tracy/TracyC.h b/libs/tracy/TracyC.h @@ -0,0 +1,188 @@ +#ifndef __TRACYC_HPP__ +#define __TRACYC_HPP__ + +#include <stddef.h> +#include <stdint.h> + +#include "client/TracyCallstack.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#ifndef TRACY_ENABLE + +typedef const void* TracyCZoneCtx; + +#define TracyCZone(c,x) +#define TracyCZoneN(c,x,y) +#define TracyCZoneC(c,x,y) +#define TracyCZoneNC(c,x,y,z) +#define TracyCZoneEnd(c) +#define TracyCZoneText(c,x,y) +#define TracyCZoneName(c,x,y) + +#define TracyCAlloc(x,y) +#define TracyCFree(x) + +#define TracyCFrameMark +#define TracyCFrameMarkNamed(x) +#define TracyCFrameMarkStart(x) +#define TracyCFrameMarkEnd(x) +#define TracyCFrameImage(x,y,z,w,a) + +#define TracyCPlot(x,y) +#define TracyCMessage(x,y) +#define TracyCMessageL(x) +#define TracyCMessageC(x,y,z) +#define TracyCMessageLC(x,y) +#define TracyCAppInfo(x,y) + +#define TracyCZoneS(x,y,z) +#define TracyCZoneNS(x,y,z,w) +#define TracyCZoneCS(x,y,z,w) +#define TracyCZoneNCS(x,y,z,w,a) + +#define TracyCAllocS(x,y,z) +#define TracyCFreeS(x,y) + +#define TracyCMessageS(x,y,z) +#define TracyCMessageLS(x,y) +#define TracyCMessageCS(x,y,z,w) +#define TracyCMessageLCS(x,y,z) + +#else + +#ifndef TracyConcat +# define TracyConcat(x,y) TracyConcatIndirect(x,y) +#endif +#ifndef TracyConcatIndirect +# define TracyConcatIndirect(x,y) x##y +#endif + +struct ___tracy_source_location_data +{ + const char* name; + const char* function; + const char* file; + uint32_t line; + uint32_t color; +}; + +struct ___tracy_c_zone_context +{ + uint32_t id; + int active; +}; + +// Some containers don't support storing const types. +// This struct, as visible to user, is immutable, so treat it as if const was declared here. 
+typedef /*const*/ struct ___tracy_c_zone_context TracyCZoneCtx; + +TRACY_API TracyCZoneCtx ___tracy_emit_zone_begin( const struct ___tracy_source_location_data* srcloc, int active ); +TRACY_API TracyCZoneCtx ___tracy_emit_zone_begin_callstack( const struct ___tracy_source_location_data* srcloc, int depth, int active ); +TRACY_API void ___tracy_emit_zone_end( TracyCZoneCtx ctx ); +TRACY_API void ___tracy_emit_zone_text( TracyCZoneCtx ctx, const char* txt, size_t size ); +TRACY_API void ___tracy_emit_zone_name( TracyCZoneCtx ctx, const char* txt, size_t size ); + +#if defined TRACY_HAS_CALLSTACK && defined TRACY_CALLSTACK +# define TracyCZone( ctx, active ) static const struct ___tracy_source_location_data TracyConcat(__tracy_source_location,__LINE__) = { NULL, __FUNCTION__, __FILE__, (uint32_t)__LINE__, 0 }; TracyCZoneCtx ctx = ___tracy_emit_zone_begin_callstack( &TracyConcat(__tracy_source_location,__LINE__), TRACY_CALLSTACK, active ); +# define TracyCZoneN( ctx, name, active ) static const struct ___tracy_source_location_data TracyConcat(__tracy_source_location,__LINE__) = { name, __FUNCTION__, __FILE__, (uint32_t)__LINE__, 0 }; TracyCZoneCtx ctx = ___tracy_emit_zone_begin_callstack( &TracyConcat(__tracy_source_location,__LINE__), TRACY_CALLSTACK, active ); +# define TracyCZoneC( ctx, color, active ) static const struct ___tracy_source_location_data TracyConcat(__tracy_source_location,__LINE__) = { NULL, __FUNCTION__, __FILE__, (uint32_t)__LINE__, color }; TracyCZoneCtx ctx = ___tracy_emit_zone_begin_callstack( &TracyConcat(__tracy_source_location,__LINE__), TRACY_CALLSTACK, active ); +# define TracyCZoneNC( ctx, name, color, active ) static const struct ___tracy_source_location_data TracyConcat(__tracy_source_location,__LINE__) = { name, __FUNCTION__, __FILE__, (uint32_t)__LINE__, color }; TracyCZoneCtx ctx = ___tracy_emit_zone_begin_callstack( &TracyConcat(__tracy_source_location,__LINE__), TRACY_CALLSTACK, active ); +#else +# define TracyCZone( ctx, active ) static const struct ___tracy_source_location_data TracyConcat(__tracy_source_location,__LINE__) = { NULL, __FUNCTION__, __FILE__, (uint32_t)__LINE__, 0 }; TracyCZoneCtx ctx = ___tracy_emit_zone_begin( &TracyConcat(__tracy_source_location,__LINE__), active ); +# define TracyCZoneN( ctx, name, active ) static const struct ___tracy_source_location_data TracyConcat(__tracy_source_location,__LINE__) = { name, __FUNCTION__, __FILE__, (uint32_t)__LINE__, 0 }; TracyCZoneCtx ctx = ___tracy_emit_zone_begin( &TracyConcat(__tracy_source_location,__LINE__), active ); +# define TracyCZoneC( ctx, color, active ) static const struct ___tracy_source_location_data TracyConcat(__tracy_source_location,__LINE__) = { NULL, __FUNCTION__, __FILE__, (uint32_t)__LINE__, color }; TracyCZoneCtx ctx = ___tracy_emit_zone_begin( &TracyConcat(__tracy_source_location,__LINE__), active ); +# define TracyCZoneNC( ctx, name, color, active ) static const struct ___tracy_source_location_data TracyConcat(__tracy_source_location,__LINE__) = { name, __FUNCTION__, __FILE__, (uint32_t)__LINE__, color }; TracyCZoneCtx ctx = ___tracy_emit_zone_begin( &TracyConcat(__tracy_source_location,__LINE__), active ); +#endif + +#define TracyCZoneEnd( ctx ) ___tracy_emit_zone_end( ctx ); + +#define TracyCZoneText( ctx, txt, size ) ___tracy_emit_zone_text( ctx, txt, size ); +#define TracyCZoneName( ctx, txt, size ) ___tracy_emit_zone_name( ctx, txt, size ); + + +TRACY_API void ___tracy_emit_memory_alloc( const void* ptr, size_t size ); +TRACY_API void 
___tracy_emit_memory_alloc_callstack( const void* ptr, size_t size, int depth ); +TRACY_API void ___tracy_emit_memory_free( const void* ptr ); +TRACY_API void ___tracy_emit_memory_free_callstack( const void* ptr, int depth ); + +TRACY_API void ___tracy_emit_message( const char* txt, size_t size, int callstack ); +TRACY_API void ___tracy_emit_messageL( const char* txt, int callstack ); +TRACY_API void ___tracy_emit_messageC( const char* txt, size_t size, uint32_t color, int callstack ); +TRACY_API void ___tracy_emit_messageLC( const char* txt, uint32_t color, int callstack ); + +#if defined TRACY_HAS_CALLSTACK && defined TRACY_CALLSTACK +# define TracyCAlloc( ptr, size ) ___tracy_emit_memory_alloc_callstack( ptr, size, TRACY_CALLSTACK ) +# define TracyCFree( ptr ) ___tracy_emit_memory_alloc_free_callstack( ptr, TRACY_CALLSTACK ) + +# define TracyCMessage( txt, size ) ___tracy_emit_message( txt, size, TRACY_CALLSTACK ); +# define TracyCMessageL( txt ) ___tracy_emit_messageL( txt, TRACY_CALLSTACK ); +# define TracyCMessageC( txt, size, color ) ___tracy_emit_messageC( txt, size, color, TRACY_CALLSTACK ); +# define TracyCMessageLC( txt, color ) ___tracy_emit_messageLC( txt, color, TRACY_CALLSTACK ); +#else +# define TracyCAlloc( ptr, size ) ___tracy_emit_memory_alloc( ptr, size ); +# define TracyCFree( ptr ) ___tracy_emit_memory_free( ptr ); + +# define TracyCMessage( txt, size ) ___tracy_emit_message( txt, size, 0 ); +# define TracyCMessageL( txt ) ___tracy_emit_messageL( txt, 0 ); +# define TracyCMessageC( txt, size, color ) ___tracy_emit_messageC( txt, size, color, 0 ); +# define TracyCMessageLC( txt, color ) ___tracy_emit_messageLC( txt, color, 0 ); +#endif + + +TRACY_API void ___tracy_emit_frame_mark( const char* name ); +TRACY_API void ___tracy_emit_frame_mark_start( const char* name ); +TRACY_API void ___tracy_emit_frame_mark_end( const char* name ); +TRACY_API void ___tracy_emit_frame_image( const void* image, uint16_t w, uint16_t h, uint8_t offset, int flip ); + +#define TracyCFrameMark ___tracy_emit_frame_mark( 0 ); +#define TracyCFrameMarkNamed( name ) ___tracy_emit_frame_mark( name ); +#define TracyCFrameMarkStart( name ) ___tracy_emit_frame_mark_start( name ); +#define TracyCFrameMarkEnd( name ) ___tracy_emit_frame_mark_end( name ); +#define TracyCFrameImage( image, width, height, offset, flip ) ___tracy_emit_frame_image( image, width, height, offset, flip ); + + +TRACY_API void ___tracy_emit_plot( const char* name, double val ); +TRACY_API void ___tracy_emit_message_appinfo( const char* txt, size_t size ); + +#define TracyCPlot( name, val ) ___tracy_emit_plot( name, val ); +#define TracyCAppInfo( txt, color ) ___tracy_emit_message_appinfo( txt, color ); + + +#ifdef TRACY_HAS_CALLSTACK +# define TracyCZoneS( ctx, depth, active ) static const struct ___tracy_source_location_data TracyConcat(__tracy_source_location,__LINE__) = { NULL, __FUNCTION__, __FILE__, (uint32_t)__LINE__, 0 }; TracyCZoneCtx ctx = ___tracy_emit_zone_begin_callstack( &TracyConcat(__tracy_source_location,__LINE__), depth, active ); +# define TracyCZoneNS( ctx, name, depth, active ) static const struct ___tracy_source_location_data TracyConcat(__tracy_source_location,__LINE__) = { name, __FUNCTION__, __FILE__, (uint32_t)__LINE__, 0 }; TracyCZoneCtx ctx = ___tracy_emit_zone_begin_callstack( &TracyConcat(__tracy_source_location,__LINE__), depth, active ); +# define TracyCZoneCS( ctx, color, depth, active ) static const struct ___tracy_source_location_data TracyConcat(__tracy_source_location,__LINE__) = { NULL, 
__FUNCTION__, __FILE__, (uint32_t)__LINE__, color }; TracyCZoneCtx ctx = ___tracy_emit_zone_begin_callstack( &TracyConcat(__tracy_source_location,__LINE__), depth, active ); +# define TracyCZoneNCS( ctx, name, color, depth, active ) static const struct ___tracy_source_location_data TracyConcat(__tracy_source_location,__LINE__) = { name, __FUNCTION__, __FILE__, (uint32_t)__LINE__, color }; TracyCZoneCtx ctx = ___tracy_emit_zone_begin_callstack( &TracyConcat(__tracy_source_location,__LINE__), depth, active ); + +# define TracyCAllocS( ptr, size, depth ) ___tracy_emit_memory_alloc_callstack( ptr, size, depth ) +# define TracyCFreeS( ptr, depth ) ___tracy_emit_memory_alloc_free_callstack( ptr, depth ) + +# define TracyCMessageS( txt, size, depth ) ___tracy_emit_message( txt, size, depth ); +# define TracyCMessageLS( txt, depth ) ___tracy_emit_messageL( txt, depth ); +# define TracyCMessageCS( txt, size, color, depth ) ___tracy_emit_messageC( txt, size, color, depth ); +# define TracyCMessageLCS( txt, color, depth ) ___tracy_emit_messageLC( txt, color, depth ); +#else +# define TracyCZoneS( ctx, depth, active ) TracyCZone( ctx, active ) +# define TracyCZoneNS( ctx, name, depth, active ) TracyCZoneN( ctx, name, active ) +# define TracyCZoneCS( ctx, color, depth, active ) TracyCZoneC( ctx, color, active ) +# define TracyCZoneNCS( ctx, name, color, depth, active ) TracyCZoneNC( ctx, name, color, active ) + +# define TracyCAllocS( ptr, size, depth ) TracyCAlloc( ptr, size ) +# define TracyCFreeS( ptr, depth ) TracyCFree( ptr ) + +# define TracyCMessageS( txt, size, depth ) TracyCMessage( txt, size ) +# define TracyCMessageLS( txt, depth ) TracyCMessageL( txt ) +# define TracyCMessageCS( txt, size, color, depth ) TracyCMessageC( txt, size, color ) +# define TracyCMessageLCS( txt, color, depth ) TracyCMessageLC( txt, color ) +#endif + +#endif + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/libs/tracy/TracyClient.cpp b/libs/tracy/TracyClient.cpp @@ -0,0 +1,43 @@ +// +// Tracy profiler +// ---------------- +// +// For fast integration, compile and +// link with this source file (and none +// other) in your executable (or in the +// main DLL / shared object on multi-DLL +// projects). +// + +// Define TRACY_ENABLE to enable profiler. + +#include "common/TracySystem.cpp" + +#ifdef TRACY_ENABLE + +#include "common/tracy_lz4.cpp" +#include "client/TracyProfiler.cpp" +#include "client/TracyCallstack.cpp" +#include "client/TracySysTime.cpp" +#include "client/TracySysTrace.cpp" +#include "common/TracySocket.cpp" +#include "client/tracy_rpmalloc.cpp" +#include "client/TracyDxt1.cpp" + +#if TRACY_HAS_CALLSTACK == 2 || TRACY_HAS_CALLSTACK == 3 || TRACY_HAS_CALLSTACK == 6 +# include "libbacktrace/alloc.cpp" +# include "libbacktrace/dwarf.cpp" +# include "libbacktrace/elf.cpp" +# include "libbacktrace/fileline.cpp" +# include "libbacktrace/mmapio.cpp" +# include "libbacktrace/posix.cpp" +# include "libbacktrace/sort.cpp" +# include "libbacktrace/state.cpp" +#endif + +#ifdef _MSC_VER +# pragma comment(lib, "ws2_32.lib") +# pragma comment(lib, "dbghelp.lib") +#endif + +#endif diff --git a/libs/tracy/TracyOpenGL.hpp b/libs/tracy/TracyOpenGL.hpp @@ -0,0 +1,272 @@ +#ifndef __TRACYOPENGL_HPP__ +#define __TRACYOPENGL_HPP__ + +// Include this file after you include OpenGL 3.2 headers. 
+ +#if !defined TRACY_ENABLE || defined __APPLE__ + +#define TracyGpuContext +#define TracyGpuNamedZone(x,y) +#define TracyGpuNamedZoneC(x,y,z) +#define TracyGpuZone(x) +#define TracyGpuZoneC(x,y) +#define TracyGpuCollect + +#define TracyGpuNamedZoneS(x,y,z) +#define TracyGpuNamedZoneCS(x,y,z,w) +#define TracyGpuZoneS(x,y) +#define TracyGpuZoneCS(x,y,z) + +namespace tracy +{ +struct SourceLocationData; +class GpuCtxScope +{ +public: + GpuCtxScope( const SourceLocationData* ) {} + GpuCtxScope( const SourceLocationData*, int depth ) {} +}; +} + +#else + +#include <atomic> +#include <assert.h> +#include <stdlib.h> + +#include "Tracy.hpp" +#include "client/TracyProfiler.hpp" +#include "client/TracyCallstack.hpp" +#include "common/TracyAlign.hpp" +#include "common/TracyAlloc.hpp" + +#if !defined GL_TIMESTAMP && defined GL_TIMESTAMP_EXT +# define GL_TIMESTAMP GL_TIMESTAMP_EXT +# define GL_QUERY_COUNTER_BITS GL_QUERY_COUNTER_BITS_EXT +# define glGetQueryObjectiv glGetQueryObjectivEXT +# define glGetQueryObjectui64v glGetQueryObjectui64vEXT +# define glQueryCounter glQueryCounterEXT +#endif + +#define TracyGpuContext tracy::GetGpuCtx().ptr = (tracy::GpuCtx*)tracy::tracy_malloc( sizeof( tracy::GpuCtx ) ); new(tracy::GetGpuCtx().ptr) tracy::GpuCtx; +#if defined TRACY_HAS_CALLSTACK && defined TRACY_CALLSTACK +# define TracyGpuNamedZone( varname, name ) static const tracy::SourceLocationData TracyConcat(__tracy_gpu_source_location,__LINE__) { name, __FUNCTION__, __FILE__, (uint32_t)__LINE__, 0 }; tracy::GpuCtxScope varname( &TracyConcat(__tracy_gpu_source_location,__LINE__), TRACY_CALLSTACK ); +# define TracyGpuNamedZoneC( varname, name, color ) static const tracy::SourceLocationData TracyConcat(__tracy_gpu_source_location,__LINE__) { name, __FUNCTION__, __FILE__, (uint32_t)__LINE__, color }; tracy::GpuCtxScope varname( &TracyConcat(__tracy_gpu_source_location,__LINE__), TRACY_CALLSTACK ); +# define TracyGpuZone( name ) TracyGpuNamedZoneS( ___tracy_gpu_zone, name, TRACY_CALLSTACK ) +# define TracyGpuZoneC( name, color ) TracyGpuNamedZoneCS( ___tracy_gpu_zone, name, color, TRACY_CALLSTACK ) +#else +# define TracyGpuNamedZone( varname, name ) static const tracy::SourceLocationData TracyConcat(__tracy_gpu_source_location,__LINE__) { name, __FUNCTION__, __FILE__, (uint32_t)__LINE__, 0 }; tracy::GpuCtxScope varname( &TracyConcat(__tracy_gpu_source_location,__LINE__) ); +# define TracyGpuNamedZoneC( varname, name, color ) static const tracy::SourceLocationData TracyConcat(__tracy_gpu_source_location,__LINE__) { name, __FUNCTION__, __FILE__, (uint32_t)__LINE__, color }; tracy::GpuCtxScope varname( &TracyConcat(__tracy_gpu_source_location,__LINE__) ); +# define TracyGpuZone( name ) TracyGpuNamedZone( ___tracy_gpu_zone, name ) +# define TracyGpuZoneC( name, color ) TracyGpuNamedZoneC( ___tracy_gpu_zone, name, color ) +#endif +#define TracyGpuCollect tracy::GetGpuCtx().ptr->Collect(); + +#ifdef TRACY_HAS_CALLSTACK +# define TracyGpuNamedZoneS( varname, name, depth ) static const tracy::SourceLocationData TracyConcat(__tracy_gpu_source_location,__LINE__) { name, __FUNCTION__, __FILE__, (uint32_t)__LINE__, 0 }; tracy::GpuCtxScope varname( &TracyConcat(__tracy_gpu_source_location,__LINE__), depth ); +# define TracyGpuNamedZoneCS( varname, name, color, depth ) static const tracy::SourceLocationData TracyConcat(__tracy_gpu_source_location,__LINE__) { name, __FUNCTION__, __FILE__, (uint32_t)__LINE__, color }; tracy::GpuCtxScope varname( &TracyConcat(__tracy_gpu_source_location,__LINE__), depth ); +# define 
TracyGpuZoneS( name, depth ) TracyGpuNamedZoneS( ___tracy_gpu_zone, name, depth ) +# define TracyGpuZoneCS( name, color, depth ) TracyGpuNamedZoneCS( ___tracy_gpu_zone, name, color, depth ) +#else +# define TracyGpuNamedZoneS( varname, name, depth ) TracyGpuNamedZone( varname, name ) +# define TracyGpuNamedZoneCS( varname, name, color, depth ) TracyGpuNamedZoneC( varname, name, color ) +# define TracyGpuZoneS( name, depth ) TracyGpuZone( name ) +# define TracyGpuZoneCS( name, color, depth ) TracyGpuZoneC( name, color ) +#endif + +namespace tracy +{ + +class GpuCtx +{ + friend class GpuCtxScope; + + enum { QueryCount = 64 * 1024 }; + +public: + GpuCtx() + : m_context( GetGpuCtxCounter().fetch_add( 1, std::memory_order_relaxed ) ) + , m_head( 0 ) + , m_tail( 0 ) + { + assert( m_context != 255 ); + + glGenQueries( QueryCount, m_query ); + + int64_t tgpu; + glGetInteger64v( GL_TIMESTAMP, &tgpu ); + int64_t tcpu = Profiler::GetTime(); + + GLint bits; + glGetQueryiv( GL_TIMESTAMP, GL_QUERY_COUNTER_BITS, &bits ); + + const float period = 1.f; + Magic magic; + const auto thread = GetThreadHandle(); + auto token = GetToken(); + auto& tail = token->get_tail_index(); + auto item = token->enqueue_begin( magic ); + MemWrite( &item->hdr.type, QueueType::GpuNewContext ); + MemWrite( &item->gpuNewContext.cpuTime, tcpu ); + MemWrite( &item->gpuNewContext.gpuTime, tgpu ); + MemWrite( &item->gpuNewContext.thread, thread ); + MemWrite( &item->gpuNewContext.period, period ); + MemWrite( &item->gpuNewContext.context, m_context ); + MemWrite( &item->gpuNewContext.accuracyBits, (uint8_t)bits ); + +#ifdef TRACY_ON_DEMAND + GetProfiler().DeferItem( *item ); +#endif + + tail.store( magic + 1, std::memory_order_release ); + } + + void Collect() + { + ZoneScopedC( Color::Red4 ); + + if( m_tail == m_head ) return; + +#ifdef TRACY_ON_DEMAND + if( !GetProfiler().IsConnected() ) + { + m_head = m_tail = 0; + return; + } +#endif + + Magic magic; + auto token = GetToken(); + auto& tail = token->get_tail_index(); + + while( m_tail != m_head ) + { + GLint available; + glGetQueryObjectiv( m_query[m_tail], GL_QUERY_RESULT_AVAILABLE, &available ); + if( !available ) return; + + uint64_t time; + glGetQueryObjectui64v( m_query[m_tail], GL_QUERY_RESULT, &time ); + + auto item = token->enqueue_begin( magic ); + MemWrite( &item->hdr.type, QueueType::GpuTime ); + MemWrite( &item->gpuTime.gpuTime, (int64_t)time ); + MemWrite( &item->gpuTime.queryId, (uint16_t)m_tail ); + MemWrite( &item->gpuTime.context, m_context ); + tail.store( magic + 1, std::memory_order_release ); + + m_tail = ( m_tail + 1 ) % QueryCount; + } + } + +private: + tracy_force_inline unsigned int NextQueryId() + { + const auto id = m_head; + m_head = ( m_head + 1 ) % QueryCount; + assert( m_head != m_tail ); + return id; + } + + tracy_force_inline unsigned int TranslateOpenGlQueryId( unsigned int id ) + { + return m_query[id]; + } + + tracy_force_inline uint8_t GetId() const + { + return m_context; + } + + unsigned int m_query[QueryCount]; + uint8_t m_context; + + unsigned int m_head; + unsigned int m_tail; +}; + +class GpuCtxScope +{ +public: + tracy_force_inline GpuCtxScope( const SourceLocationData* srcloc ) +#ifdef TRACY_ON_DEMAND + : m_active( GetProfiler().IsConnected() ) +#endif + { +#ifdef TRACY_ON_DEMAND + if( !m_active ) return; +#endif + const auto queryId = GetGpuCtx().ptr->NextQueryId(); + glQueryCounter( GetGpuCtx().ptr->TranslateOpenGlQueryId( queryId ), GL_TIMESTAMP ); + + Magic magic; + auto token = GetToken(); + auto& tail = token->get_tail_index(); + 
auto item = token->enqueue_begin( magic ); + MemWrite( &item->hdr.type, QueueType::GpuZoneBegin ); + MemWrite( &item->gpuZoneBegin.cpuTime, Profiler::GetTime() ); + MemWrite( &item->gpuZoneBegin.srcloc, (uint64_t)srcloc ); + memset( &item->gpuZoneBegin.thread, 0, sizeof( item->gpuZoneBegin.thread ) ); + MemWrite( &item->gpuZoneBegin.queryId, uint16_t( queryId ) ); + MemWrite( &item->gpuZoneBegin.context, GetGpuCtx().ptr->GetId() ); + tail.store( magic + 1, std::memory_order_release ); + } + + tracy_force_inline GpuCtxScope( const SourceLocationData* srcloc, int depth ) +#ifdef TRACY_ON_DEMAND + : m_active( GetProfiler().IsConnected() ) +#endif + { +#ifdef TRACY_ON_DEMAND + if( !m_active ) return; +#endif + const auto queryId = GetGpuCtx().ptr->NextQueryId(); + glQueryCounter( GetGpuCtx().ptr->TranslateOpenGlQueryId( queryId ), GL_TIMESTAMP ); + + Magic magic; + const auto thread = GetThreadHandle(); + auto token = GetToken(); + auto& tail = token->get_tail_index(); + auto item = token->enqueue_begin( magic ); + MemWrite( &item->hdr.type, QueueType::GpuZoneBeginCallstack ); + MemWrite( &item->gpuZoneBegin.cpuTime, Profiler::GetTime() ); + MemWrite( &item->gpuZoneBegin.srcloc, (uint64_t)srcloc ); + MemWrite( &item->gpuZoneBegin.thread, thread ); + MemWrite( &item->gpuZoneBegin.queryId, uint16_t( queryId ) ); + MemWrite( &item->gpuZoneBegin.context, GetGpuCtx().ptr->GetId() ); + tail.store( magic + 1, std::memory_order_release ); + + GetProfiler().SendCallstack( depth ); + } + + tracy_force_inline ~GpuCtxScope() + { +#ifdef TRACY_ON_DEMAND + if( !m_active ) return; +#endif + const auto queryId = GetGpuCtx().ptr->NextQueryId(); + glQueryCounter( GetGpuCtx().ptr->TranslateOpenGlQueryId( queryId ), GL_TIMESTAMP ); + + Magic magic; + auto token = GetToken(); + auto& tail = token->get_tail_index(); + auto item = token->enqueue_begin( magic ); + MemWrite( &item->hdr.type, QueueType::GpuZoneEnd ); + MemWrite( &item->gpuZoneEnd.cpuTime, Profiler::GetTime() ); + memset( &item->gpuZoneEnd.thread, 0, sizeof( item->gpuZoneEnd.thread ) ); + MemWrite( &item->gpuZoneEnd.queryId, uint16_t( queryId ) ); + MemWrite( &item->gpuZoneEnd.context, GetGpuCtx().ptr->GetId() ); + tail.store( magic + 1, std::memory_order_release ); + } + +private: +#ifdef TRACY_ON_DEMAND + const bool m_active; +#endif +}; + +} + +#endif + +#endif diff --git a/libs/tracy/client/TracyArmCpuTable.hpp b/libs/tracy/client/TracyArmCpuTable.hpp @@ -0,0 +1,319 @@ +namespace tracy +{ + +static const char* DecodeArmImplementer( uint32_t v ) +{ + static char buf[16]; + switch( v ) + { + case 0x41: return "ARM"; + case 0x42: return "Broadcom"; + case 0x43: return "Cavium"; + case 0x44: return "DEC"; + case 0x46: return "Fujitsu"; + case 0x48: return "HiSilicon"; + case 0x4d: return "Motorola"; + case 0x4e: return "Nvidia"; + case 0x50: return "Applied Micro"; + case 0x51: return "Qualcomm"; + case 0x53: return "Samsung"; + case 0x54: return "Texas Instruments"; + case 0x56: return "Marvell"; + case 0x61: return "Apple"; + case 0x66: return "Faraday"; + case 0x68: return "HXT"; + case 0x69: return "Intel"; + default: break; + } + sprintf( buf, "0x%x", v ); + return buf; +} + +static const char* DecodeArmPart( uint32_t impl, uint32_t part ) +{ + static char buf[16]; + switch( impl ) + { + case 0x41: + switch( part ) + { + case 0x810: return "810"; + case 0x920: return "920"; + case 0x922: return "922"; + case 0x926: return "926"; + case 0x940: return "940"; + case 0x946: return "946"; + case 0x966: return "966"; + case 0xa20: return "1020"; + case 
0xa22: return "1022"; + case 0xa26: return "1026"; + case 0xb02: return "11 MPCore"; + case 0xb36: return "1136"; + case 0xb56: return "1156"; + case 0xb76: return "1176"; + case 0xc05: return " Cortex-A5"; + case 0xc07: return " Cortex-A7"; + case 0xc08: return " Cortex-A8"; + case 0xc09: return " Cortex-A9"; + case 0xc0c: return " Cortex-A12"; + case 0xc0d: return " Rockchip RK3288"; + case 0xc0f: return " Cortex-A15"; + case 0xc0e: return " Cortex-A17"; + case 0xc14: return " Cortex-R4"; + case 0xc15: return " Cortex-R5"; + case 0xc17: return " Cortex-R7"; + case 0xc18: return " Cortex-R8"; + case 0xc20: return " Cortex-M0"; + case 0xc21: return " Cortex-M1"; + case 0xc23: return " Cortex-M3"; + case 0xc24: return " Cortex-M4"; + case 0xc27: return " Cortex-M7"; + case 0xc60: return " Cortex-M0+"; + case 0xd00: return " AArch64 simulator"; + case 0xd01: return " Cortex-A32"; + case 0xd03: return " Cortex-A53"; + case 0xd04: return " Cortex-A35"; + case 0xd05: return " Cortex-A55"; + case 0xd06: return " Cortex-A65"; + case 0xd07: return " Cortex-A57"; + case 0xd08: return " Cortex-A72"; + case 0xd09: return " Cortex-A73"; + case 0xd0a: return " Cortex-A75"; + case 0xd0b: return " Cortex-A76"; + case 0xd0c: return " Neoverse N1"; + case 0xd0d: return " Cortex-A77"; + case 0xd0e: return " Cortex-A76AE"; + case 0xd0f: return " AEMv8"; + case 0xd13: return " Cortex-R52"; + case 0xd20: return " Cortex-M23"; + case 0xd21: return " Cortex-M33"; + case 0xd4a: return " Neoverse E1"; + default: break; + } + case 0x42: + switch( part ) + { + case 0xf: return " Brahma B15"; + case 0x100: return " Brahma B53"; + case 0x516: return " ThunderX2"; + default: break; + } + case 0x43: + switch( part ) + { + case 0xa0: return " ThunderX"; + case 0xa1: return " ThunderX 88XX"; + case 0xa2: return " ThunderX 81XX"; + case 0xa3: return " ThunderX 83XX"; + case 0xaf: return " ThunderX2 99xx"; + default: break; + } + case 0x44: + switch( part ) + { + case 0xa10: return " SA110"; + case 0xa11: return " SA1100"; + default: break; + } + case 0x46: + switch( part ) + { + case 0x1: return " A64FX"; + default: break; + } + case 0x48: + switch( part ) + { + case 0xd01: return " TSV100"; + case 0xd40: return " Kirin 980"; + default: break; + } + case 0x4e: + switch( part ) + { + case 0x0: return " Denver"; + case 0x3: return " Denver 2"; + case 0x4: return " Carmel"; + default: break; + } + case 0x50: + switch( part ) + { + case 0x0: return " X-Gene"; + default: break; + } + case 0x51: + switch( part ) + { + case 0xf: return " Scorpion"; + case 0x2d: return " Scorpion"; + case 0x4d: return " Krait"; + case 0x6f: return " Krait"; + case 0x200: return " Kryo"; + case 0x201: return " Kryo Silver (Snapdragon 821)"; + case 0x205: return " Kryo Gold"; + case 0x211: return " Kryo Silver (Snapdragon 820)"; + case 0x800: return " Kryo 260 / 280 Gold"; + case 0x801: return " Kryo 260 / 280 Silver"; + case 0x802: return " Kryo 385 Gold"; + case 0x803: return " Kryo 385 Silver"; + case 0x804: return " Kryo 485 Gold"; + case 0xc00: return " Falkor"; + case 0xc01: return " Saphira"; + default: break; + } + case 0x53: + switch( part ) + { + case 0x1: return " Exynos M1/M2"; + case 0x2: return " Exynos M3"; + default: break; + } + case 0x56: + switch( part ) + { + case 0x131: return " Feroceon 88FR131"; + case 0x581: return " PJ4 / PJ4B"; + case 0x584: return " PJ4B-MP / PJ4C"; + default: break; + } + case 0x61: + switch( part ) + { + case 0x1: return " Cyclone"; + case 0x2: return " Typhoon"; + case 0x3: return " Typhoon/Capri"; + 
case 0x4: return " Twister"; + case 0x5: return " Twister/Elba/Malta"; + case 0x6: return " Hurricane"; + case 0x7: return " Hurricane/Myst"; + default: break; + } + case 0x66: + switch( part ) + { + case 0x526: return " FA526"; + case 0x626: return " FA626"; + default: break; + } + case 0x68: + switch( part ) + { + case 0x0: return " Phecda"; + default: break; + } + default: break; + } + sprintf( buf, " 0x%x", part ); + return buf; +} + +static const char* DecodeIosDevice( const char* id ) +{ + static const char* DeviceTable[] = { + "i386", "32-bit simulator", + "x86_64", "64-bit simulator", + "iPhone1,1", "iPhone", + "iPhone1,2", "iPhone 3G", + "iPhone2,1", "iPhone 3GS", + "iPhone3,1", "iPhone 4 (GSM)", + "iPhone3,2", "iPhone 4 (GSM)", + "iPhone3,3", "iPhone 4 (CDMA)", + "iPhone4,1", "iPhone 4S", + "iPhone5,1", "iPhone 5 (A1428)", + "iPhone5,2", "iPhone 5 (A1429)", + "iPhone5,3", "iPhone 5c (A1456/A1532)", + "iPhone5,4", "iPhone 5c (A1507/A1516/1526/A1529)", + "iPhone6,1", "iPhone 5s (A1433/A1533)", + "iPhone6,2", "iPhone 5s (A1457/A1518/A1528/A1530)", + "iPhone7,1", "iPhone 6 Plus", + "iPhone7,2", "iPhone 6", + "iPhone8,1", "iPhone 6S", + "iPhone8,2", "iPhone 6S Plus", + "iPhone8,4", "iPhone SE", + "iPhone9,1", "iPhone 7 (CDMA)", + "iPhone9,2", "iPhone 7 Plus (CDMA)", + "iPhone9,3", "iPhone 7 (GSM)", + "iPhone9,4", "iPhone 7 Plus (GSM)", + "iPhone10,1", "iPhone 8 (CDMA)", + "iPhone10,2", "iPhone 8 Plus (CDMA)", + "iPhone10,3", "iPhone X (CDMA)", + "iPhone10,4", "iPhone 8 (GSM)", + "iPhone10,5", "iPhone 8 Plus (GSM)", + "iPhone10,6", "iPhone X (GSM)", + "iPhone11,2", "iPhone XS", + "iPhone11,4", "iPhone XS Max", + "iPhone11,6", "iPhone XS Max China", + "iPhone11,8", "iPhone XR", + "iPad1,1", "iPad (A1219/A1337)", + "iPad2,1", "iPad 2 (A1395)", + "iPad2,2", "iPad 2 (A1396)", + "iPad2,3", "iPad 2 (A1397)", + "iPad2,4", "iPad 2 (A1395)", + "iPad2,5", "iPad Mini (A1432)", + "iPad2,6", "iPad Mini (A1454)", + "iPad2,7", "iPad Mini (A1455)", + "iPad3,1", "iPad 3 (A1416)", + "iPad3,2", "iPad 3 (A1403)", + "iPad3,3", "iPad 3 (A1430)", + "iPad3,4", "iPad 4 (A1458)", + "iPad3,5", "iPad 4 (A1459)", + "iPad3,6", "iPad 4 (A1460)", + "iPad4,1", "iPad Air (A1474)", + "iPad4,2", "iPad Air (A1475)", + "iPad4,3", "iPad Air (A1476)", + "iPad4,4", "iPad Mini 2 (A1489)", + "iPad4,5", "iPad Mini 2 (A1490)", + "iPad4,6", "iPad Mini 2 (A1491)", + "iPad4,7", "iPad Mini 3 (A1599)", + "iPad4,8", "iPad Mini 3 (A1600)", + "iPad4,9", "iPad Mini 3 (A1601)", + "iPad5,1", "iPad Mini 4 (A1538)", + "iPad5,2", "iPad Mini 4 (A1550)", + "iPad5,3", "iPad Air 2 (A1566)", + "iPad5,4", "iPad Air 2 (A1567)", + "iPad6,3", "iPad Pro 9.7\" (A1673)", + "iPad6,4", "iPad Pro 9.7\" (A1674)", + "iPad6,5", "iPad Pro 9.7\" (A1675)", + "iPad6,7", "iPad Pro 12.9\" (A1584)", + "iPad6,8", "iPad Pro 12.9\" (A1652)", + "iPad6,11", "iPad 5th gen (A1822)", + "iPad6,12", "iPad 5th gen (A1823)", + "iPad7,1", "iPad Pro 12.9\" 2nd gen (A1670)", + "iPad7,2", "iPad Pro 12.9\" 2nd gen (A1671/A1821)", + "iPad7,3", "iPad Pro 10.5\" (A1701)", + "iPad7,4", "iPad Pro 10.5\" (A1709)", + "iPad7,5", "iPad 6th gen (A1893)", + "iPad7,6", "iPad 6th gen (A1954)", + "iPad8,1", "iPad Pro 11\" (A1980)", + "iPad8,2", "iPad Pro 11\" (A1980)", + "iPad8,3", "iPad Pro 11\" (A1934/A1979/A2013)", + "iPad8,4", "iPad Pro 11\" (A1934/A1979/A2013)", + "iPad8,5", "iPad Pro 12.9\" 3rd gen (A1876)", + "iPad8,6", "iPad Pro 12.9\" 3rd gen (A1876)", + "iPad8,7", "iPad Pro 12.9\" 3rd gen (A1895/A1983/A2014)", + "iPad8,8", "iPad Pro 12.9\" 3rd gen (A1895/A1983/A2014)", + "iPad11,1", "iPad 
Mini 5th gen (A2133)", + "iPad11,2", "iPad Mini 5th gen (A2124/A2125/A2126)", + "iPad11,3", "iPad Air 3rd gen (A2152)", + "iPad11,4", "iPad Air 3rd gen (A2123/A2153/A2154)", + "iPod1,1", "iPod Touch", + "iPod2,1", "iPod Touch 2nd gen", + "iPod3,1", "iPod Touch 3rd gen", + "iPod4,1", "iPod Touch 4th gen", + "iPod5,1", "iPod Touch 5th gen", + "iPod7,1", "iPod Touch 6th gen", + "iPod9,1", "iPod Touch 7th gen", + nullptr + }; + + auto ptr = DeviceTable; + while( *ptr ) + { + if( strcmp( ptr[0], id ) == 0 ) return ptr[1]; + ptr += 2; + } + return id; +} + +} diff --git a/libs/tracy/client/TracyCallstack.cpp b/libs/tracy/client/TracyCallstack.cpp @@ -0,0 +1,603 @@ +#include <stdio.h> +#include <string.h> +#include "TracyCallstack.hpp" + +#ifdef TRACY_HAS_CALLSTACK + +#if TRACY_HAS_CALLSTACK == 1 +# ifndef NOMINMAX +# define NOMINMAX +# endif +# include <windows.h> +# ifdef _MSC_VER +# pragma warning( push ) +# pragma warning( disable : 4091 ) +# endif +# include <dbghelp.h> +# ifdef _MSC_VER +# pragma warning( pop ) +# endif +#elif TRACY_HAS_CALLSTACK == 2 || TRACY_HAS_CALLSTACK == 3 || TRACY_HAS_CALLSTACK == 6 +# include "../libbacktrace/backtrace.hpp" +# include <dlfcn.h> +# include <cxxabi.h> +#elif TRACY_HAS_CALLSTACK == 4 || TRACY_HAS_CALLSTACK == 5 +# include <dlfcn.h> +# include <cxxabi.h> +#endif + +namespace tracy +{ + +#if TRACY_HAS_CALLSTACK == 1 + +enum { MaxCbTrace = 16 }; + +int cb_num; +CallstackEntry cb_data[MaxCbTrace]; + +extern "C" { t_RtlWalkFrameChain RtlWalkFrameChain = 0; } + +#if defined __MINGW32__ && API_VERSION_NUMBER < 12 +extern "C" { +// Actual required API_VERSION_NUMBER is unknown because it is undocumented. These functions are not present in at least v11. +DWORD IMAGEAPI SymAddrIncludeInlineTrace(HANDLE hProcess, DWORD64 Address); +BOOL IMAGEAPI SymQueryInlineTrace(HANDLE hProcess, DWORD64 StartAddress, DWORD StartContext, DWORD64 StartRetAddress, + DWORD64 CurAddress, LPDWORD CurContext, LPDWORD CurFrameIndex); +BOOL IMAGEAPI SymFromInlineContext(HANDLE hProcess, DWORD64 Address, ULONG InlineContext, PDWORD64 Displacement, + PSYMBOL_INFO Symbol); +BOOL IMAGEAPI SymGetLineFromInlineContext(HANDLE hProcess, DWORD64 qwAddr, ULONG InlineContext, + DWORD64 qwModuleBaseAddress, PDWORD pdwDisplacement, PIMAGEHLP_LINE64 Line64); +}; +#endif + +void InitCallstack() +{ +#ifdef UNICODE + RtlWalkFrameChain = (t_RtlWalkFrameChain)GetProcAddress( GetModuleHandle( L"ntdll.dll" ), "RtlWalkFrameChain" ); +#else + RtlWalkFrameChain = (t_RtlWalkFrameChain)GetProcAddress( GetModuleHandle( "ntdll.dll" ), "RtlWalkFrameChain" ); +#endif + SymInitialize( GetCurrentProcess(), nullptr, true ); + SymSetOptions( SYMOPT_LOAD_LINES ); +} + +const char* DecodeCallstackPtrFast( uint64_t ptr ) +{ + static char ret[1024]; + const auto proc = GetCurrentProcess(); + + char buf[sizeof( SYMBOL_INFO ) + 1024]; + auto si = (SYMBOL_INFO*)buf; + si->SizeOfStruct = sizeof( SYMBOL_INFO ); + si->MaxNameLen = 1024; + + if( SymFromAddr( proc, ptr, nullptr, si ) == 0 ) + { + *ret = '\0'; + } + else + { + memcpy( ret, si->Name, si->NameLen ); + ret[si->NameLen] = '\0'; + } + return ret; +} + +CallstackEntryData DecodeCallstackPtr( uint64_t ptr ) +{ + int write; + const auto proc = GetCurrentProcess(); +#ifndef __CYGWIN__ + DWORD inlineNum = SymAddrIncludeInlineTrace( proc, ptr ); + if( inlineNum > MaxCbTrace - 1 ) inlineNum = MaxCbTrace - 1; + DWORD ctx = 0; + DWORD idx; + BOOL doInline = FALSE; + if( inlineNum != 0 ) doInline = SymQueryInlineTrace( proc, ptr, 0, ptr, ptr, &ctx, &idx ); + if( doInline ) + { + 
write = inlineNum; + cb_num = 1 + inlineNum; + } + else +#endif + { + write = 0; + cb_num = 1; + } + + char buf[sizeof( SYMBOL_INFO ) + 1024]; + auto si = (SYMBOL_INFO*)buf; + si->SizeOfStruct = sizeof( SYMBOL_INFO ); + si->MaxNameLen = 1024; + + if( SymFromAddr( proc, ptr, nullptr, si ) == 0 ) + { + memcpy( si->Name, "[unknown]", 10 ); + si->NameLen = 9; + } + + IMAGEHLP_LINE64 line; + DWORD displacement = 0; + line.SizeOfStruct = sizeof(IMAGEHLP_LINE64); + + { + auto name = (char*)tracy_malloc(si->NameLen + 1); + memcpy(name, si->Name, si->NameLen); + name[si->NameLen] = '\0'; + + cb_data[write].name = name; + + const char* filename; + if (SymGetLineFromAddr64(proc, ptr, &displacement, &line) == 0) + { + filename = "[unknown]"; + cb_data[write].line = 0; + } + else + { + filename = line.FileName; + cb_data[write].line = line.LineNumber; + } + + const auto fsz = strlen(filename); + auto file = (char*)tracy_malloc(fsz + 1); + memcpy(file, filename, fsz); + file[fsz] = '\0'; + + cb_data[write].file = file; + } + +#ifndef __CYGWIN__ + if( doInline ) + { + for( DWORD i=0; i<inlineNum; i++ ) + { + auto& cb = cb_data[i]; + + if( SymFromInlineContext( proc, ptr, ctx, nullptr, si ) == 0 ) + { + memcpy( si->Name, "[unknown]", 10 ); + si->NameLen = 9; + } + + auto name = (char*)tracy_malloc( si->NameLen + 1 ); + memcpy( name, si->Name, si->NameLen ); + name[si->NameLen] = '\0'; + cb.name = name; + + const char* filename; + if( SymGetLineFromInlineContext( proc, ptr, ctx, 0, &displacement, &line ) == 0 ) + { + filename = "[unknown]"; + cb.line = 0; + } + else + { + filename = line.FileName; + cb.line = line.LineNumber; + } + + const auto fsz = strlen( filename ); + auto file = (char*)tracy_malloc( fsz + 1 ); + memcpy( file, filename, fsz ); + file[fsz] = '\0'; + cb.file = file; + + ctx++; + } + } +#endif + + return { cb_data, uint8_t( cb_num ) }; +} + +#elif TRACY_HAS_CALLSTACK == 4 + +void InitCallstack() +{ +} + +const char* DecodeCallstackPtrFast( uint64_t ptr ) +{ + static char ret[1024]; + auto vptr = (void*)ptr; + char** sym = nullptr; + const char* symname = nullptr; + Dl_info dlinfo; + if( dladdr( vptr, &dlinfo ) && dlinfo.dli_sname ) + { + symname = dlinfo.dli_sname; + } + else + { + sym = backtrace_symbols( &vptr, 1 ); + if( sym ) + { + symname = *sym; + } + } + if( symname ) + { + strcpy( ret, symname ); + } + else + { + *ret = '\0'; + } + return ret; +} + +CallstackEntryData DecodeCallstackPtr( uint64_t ptr ) +{ + static CallstackEntry cb; + cb.line = 0; + + char* demangled = nullptr; + const char* symname = nullptr; + const char* symloc = nullptr; + auto vptr = (void*)ptr; + char** sym = nullptr; + ptrdiff_t symoff = 0; + + Dl_info dlinfo; + if( dladdr( vptr, &dlinfo ) ) + { + symloc = dlinfo.dli_fname; + symname = dlinfo.dli_sname; + symoff = (char*)ptr - (char*)dlinfo.dli_saddr; + + if( symname && symname[0] == '_' ) + { + size_t len = 0; + int status; + demangled = abi::__cxa_demangle( symname, nullptr, &len, &status ); + if( status == 0 ) + { + symname = demangled; + } + } + } + + if( !symname ) + { + sym = backtrace_symbols( &vptr, 1 ); + if( !sym ) + { + symname = "[unknown]"; + } + else + { + symname = *sym; + } + } + if( !symloc ) + { + symloc = "[unknown]"; + } + + if( symoff == 0 ) + { + const auto namelen = strlen( symname ); + auto name = (char*)tracy_malloc( namelen + 1 ); + memcpy( name, symname, namelen ); + name[namelen] = '\0'; + cb.name = name; + } + else + { + char buf[32]; + const auto offlen = sprintf( buf, " + %td", symoff ); + const auto namelen = strlen( symname 
); + auto name = (char*)tracy_malloc( namelen + offlen + 1 ); + memcpy( name, symname, namelen ); + memcpy( name + namelen, buf, offlen ); + name[namelen + offlen] = '\0'; + cb.name = name; + } + + char buf[32]; + const auto addrlen = sprintf( buf, " [%p]", (void*)ptr ); + const auto loclen = strlen( symloc ); + auto loc = (char*)tracy_malloc( loclen + addrlen + 1 ); + memcpy( loc, symloc, loclen ); + memcpy( loc + loclen, buf, addrlen ); + loc[loclen + addrlen] = '\0'; + cb.file = loc; + + if( sym ) free( sym ); + if( demangled ) free( demangled ); + + return { &cb, 1 }; +} + +#elif TRACY_HAS_CALLSTACK == 2 || TRACY_HAS_CALLSTACK == 3 || TRACY_HAS_CALLSTACK == 6 + +enum { MaxCbTrace = 16 }; + +struct backtrace_state* cb_bts; +int cb_num; +CallstackEntry cb_data[MaxCbTrace]; + +void InitCallstack() +{ + cb_bts = backtrace_create_state( nullptr, 0, nullptr, nullptr ); +} + +static inline char* CopyString( const char* src ) +{ + const auto sz = strlen( src ); + auto dst = (char*)tracy_malloc( sz + 1 ); + memcpy( dst, src, sz ); + dst[sz] = '\0'; + return dst; +} + +static int FastCallstackDataCb( void* data, uintptr_t pc, const char* fn, int lineno, const char* function ) +{ + if( function ) + { + strcpy( (char*)data, function ); + } + else + { + const char* symname = nullptr; + auto vptr = (void*)pc; + Dl_info dlinfo; + if( dladdr( vptr, &dlinfo ) ) + { + symname = dlinfo.dli_sname; + } + if( symname ) + { + strcpy( (char*)data, symname ); + } + else + { + *(char*)data = '\0'; + } + } + return 1; +} + +static void FastCallstackErrorCb( void* data, const char* /*msg*/, int /*errnum*/ ) +{ + *(char*)data = '\0'; +} + +const char* DecodeCallstackPtrFast( uint64_t ptr ) +{ + static char ret[1024]; + backtrace_pcinfo( cb_bts, ptr, FastCallstackDataCb, FastCallstackErrorCb, ret ); + return ret; +} + +static int CallstackDataCb( void* /*data*/, uintptr_t pc, const char* fn, int lineno, const char* function ) +{ + enum { DemangleBufLen = 64*1024 }; + char demangled[DemangleBufLen]; + + if( !fn && !function ) + { + const char* symname = nullptr; + const char* symloc = nullptr; + auto vptr = (void*)pc; + ptrdiff_t symoff = 0; + + Dl_info dlinfo; + if( dladdr( vptr, &dlinfo ) ) + { + symloc = dlinfo.dli_fname; + symname = dlinfo.dli_sname; + symoff = (char*)pc - (char*)dlinfo.dli_saddr; + + if( symname && symname[0] == '_' ) + { + size_t len = DemangleBufLen; + int status; + abi::__cxa_demangle( symname, demangled, &len, &status ); + if( status == 0 ) + { + symname = demangled; + } + } + } + + if( !symname ) symname = "[unknown]"; + if( !symloc ) symloc = "[unknown]"; + + if( symoff == 0 ) + { + cb_data[cb_num].name = CopyString( symname ); + } + else + { + char buf[32]; + const auto offlen = sprintf( buf, " + %td", symoff ); + const auto namelen = strlen( symname ); + auto name = (char*)tracy_malloc( namelen + offlen + 1 ); + memcpy( name, symname, namelen ); + memcpy( name + namelen, buf, offlen ); + name[namelen + offlen] = '\0'; + cb_data[cb_num].name = name; + } + + char buf[32]; + const auto addrlen = sprintf( buf, " [%p]", (void*)pc ); + const auto loclen = strlen( symloc ); + auto loc = (char*)tracy_malloc( loclen + addrlen + 1 ); + memcpy( loc, symloc, loclen ); + memcpy( loc + loclen, buf, addrlen ); + loc[loclen + addrlen] = '\0'; + cb_data[cb_num].file = loc; + + cb_data[cb_num].line = 0; + } + else + { + if( !fn ) fn = "[unknown]"; + if( !function ) + { + function = "[unknown]"; + } + else + { + if( function[0] == '_' ) + { + size_t len = DemangleBufLen; + int status; + 
abi::__cxa_demangle( function, demangled, &len, &status ); + if( status == 0 ) + { + function = demangled; + } + } + } + + cb_data[cb_num].name = CopyString( function ); + cb_data[cb_num].file = CopyString( fn ); + cb_data[cb_num].line = lineno; + } + + if( ++cb_num >= MaxCbTrace ) + { + return 1; + } + else + { + return 0; + } +} + +static void CallstackErrorCb( void* /*data*/, const char* /*msg*/, int /*errnum*/ ) +{ + for( int i=0; i<cb_num; i++ ) + { + tracy_free( (void*)cb_data[i].name ); + tracy_free( (void*)cb_data[i].file ); + } + + cb_data[0].name = CopyString( "[error]" ); + cb_data[0].file = CopyString( "[error]" ); + cb_data[0].line = 0; + + cb_num = 1; +} + +CallstackEntryData DecodeCallstackPtr( uint64_t ptr ) +{ + cb_num = 0; + backtrace_pcinfo( cb_bts, ptr, CallstackDataCb, CallstackErrorCb, nullptr ); + assert( cb_num > 0 ); + return { cb_data, uint8_t( cb_num ) }; +} + +#elif TRACY_HAS_CALLSTACK == 5 + +void InitCallstack() +{ +} + +const char* DecodeCallstackPtrFast( uint64_t ptr ) +{ + static char ret[1024]; + auto vptr = (void*)ptr; + char** sym = nullptr; + const char* symname = nullptr; + Dl_info dlinfo; + if( dladdr( vptr, &dlinfo ) && dlinfo.dli_sname ) + { + symname = dlinfo.dli_sname; + } + if( symname ) + { + strcpy( ret, symname ); + } + else + { + *ret = '\0'; + } + return ret; +} + +CallstackEntryData DecodeCallstackPtr( uint64_t ptr ) +{ + static CallstackEntry cb; + cb.line = 0; + + char* demangled = nullptr; + const char* symname = nullptr; + const char* symloc = nullptr; + auto vptr = (void*)ptr; + char** sym = nullptr; + ptrdiff_t symoff = 0; + + Dl_info dlinfo; + if( dladdr( vptr, &dlinfo ) ) + { + symloc = dlinfo.dli_fname; + symname = dlinfo.dli_sname; + symoff = (char*)ptr - (char*)dlinfo.dli_saddr; + + if( symname && symname[0] == '_' ) + { + size_t len = 0; + int status; + demangled = abi::__cxa_demangle( symname, nullptr, &len, &status ); + if( status == 0 ) + { + symname = demangled; + } + } + } + + if( !symname ) + { + symname = "[unknown]"; + } + if( !symloc ) + { + symloc = "[unknown]"; + } + + if( symoff == 0 ) + { + const auto namelen = strlen( symname ); + auto name = (char*)tracy_malloc( namelen + 1 ); + memcpy( name, symname, namelen ); + name[namelen] = '\0'; + cb.name = name; + } + else + { + char buf[32]; + const auto offlen = sprintf( buf, " + %td", symoff ); + const auto namelen = strlen( symname ); + auto name = (char*)tracy_malloc( namelen + offlen + 1 ); + memcpy( name, symname, namelen ); + memcpy( name + namelen, buf, offlen ); + name[namelen + offlen] = '\0'; + cb.name = name; + } + + char buf[32]; + const auto addrlen = sprintf( buf, " [%p]", (void*)ptr ); + const auto loclen = strlen( symloc ); + auto loc = (char*)tracy_malloc( loclen + addrlen + 1 ); + memcpy( loc, symloc, loclen ); + memcpy( loc + loclen, buf, addrlen ); + loc[loclen + addrlen] = '\0'; + cb.file = loc; + + if( sym ) free( sym ); + if( demangled ) free( demangled ); + + return { &cb, 1 }; +} + +#endif + +} + +#endif diff --git a/libs/tracy/client/TracyCallstack.h b/libs/tracy/client/TracyCallstack.h @@ -0,0 +1,29 @@ +#ifndef __TRACYCALLSTACK_H__ +#define __TRACYCALLSTACK_H__ + +#if !defined _WIN32 && !defined __CYGWIN__ +# include <sys/param.h> +#endif + +#if defined _WIN32 || defined __CYGWIN__ +# define TRACY_HAS_CALLSTACK 1 +#elif defined __ANDROID__ +# if !defined __arm__ || __ANDROID_API__ >= 21 +# define TRACY_HAS_CALLSTACK 2 +# else +# define TRACY_HAS_CALLSTACK 5 +# endif +#elif defined __linux +// XXX: diesel changes +// # if defined _GNU_SOURCE && 
defined __GLIBC__ +// # define TRACY_HAS_CALLSTACK 3 +// # else +// # define TRACY_HAS_CALLSTACK 2 +// # endif +#elif defined __APPLE__ +# define TRACY_HAS_CALLSTACK 4 +#elif defined BSD +# define TRACY_HAS_CALLSTACK 6 +#endif + +#endif diff --git a/libs/tracy/client/TracyCallstack.hpp b/libs/tracy/client/TracyCallstack.hpp @@ -0,0 +1,112 @@ +#ifndef __TRACYCALLSTACK_HPP__ +#define __TRACYCALLSTACK_HPP__ + +#include "TracyCallstack.h" + +#if TRACY_HAS_CALLSTACK == 1 +extern "C" +{ + typedef unsigned long (__stdcall *t_RtlWalkFrameChain)( void**, unsigned long, unsigned long ); + extern t_RtlWalkFrameChain RtlWalkFrameChain; +} +#elif TRACY_HAS_CALLSTACK == 2 || TRACY_HAS_CALLSTACK == 5 +# include <unwind.h> +#elif TRACY_HAS_CALLSTACK >= 3 +# include <execinfo.h> +#endif + + +#ifdef TRACY_HAS_CALLSTACK + +#include <assert.h> +#include <stdint.h> + +#include "../common/TracyAlloc.hpp" +#include "../common/TracyForceInline.hpp" + +namespace tracy +{ + +struct CallstackEntry +{ + const char* name; + const char* file; + uint32_t line; +}; + +struct CallstackEntryData +{ + const CallstackEntry* data; + uint8_t size; +}; + +const char* DecodeCallstackPtrFast( uint64_t ptr ); +CallstackEntryData DecodeCallstackPtr( uint64_t ptr ); +void InitCallstack(); + +#if TRACY_HAS_CALLSTACK == 1 + +static tracy_force_inline void* Callstack( int depth ) +{ + assert( depth >= 1 && depth < 63 ); + + auto trace = (uintptr_t*)tracy_malloc( ( 1 + depth ) * sizeof( uintptr_t ) ); + const auto num = RtlWalkFrameChain( (void**)( trace + 1 ), depth, 0 ); + *trace = num; + + return trace; +} + +#elif TRACY_HAS_CALLSTACK == 2 || TRACY_HAS_CALLSTACK == 5 + +struct BacktraceState +{ + void** current; + void** end; +}; + +static _Unwind_Reason_Code tracy_unwind_callback( struct _Unwind_Context* ctx, void* arg ) +{ + auto state = (BacktraceState*)arg; + uintptr_t pc = _Unwind_GetIP( ctx ); + if( pc ) + { + if( state->current == state->end ) return _URC_END_OF_STACK; + *state->current++ = (void*)pc; + } + return _URC_NO_REASON; +} + +static tracy_force_inline void* Callstack( int depth ) +{ + assert( depth >= 1 && depth < 63 ); + + auto trace = (uintptr_t*)tracy_malloc( ( 1 + depth ) * sizeof( uintptr_t ) ); + BacktraceState state = { (void**)(trace+1), (void**)(trace+1+depth) }; + _Unwind_Backtrace( tracy_unwind_callback, &state ); + + *trace = (uintptr_t*)state.current - trace + 1; + + return trace; +} + +#elif TRACY_HAS_CALLSTACK == 3 || TRACY_HAS_CALLSTACK == 4 || TRACY_HAS_CALLSTACK == 6 + +static tracy_force_inline void* Callstack( int depth ) +{ + assert( depth >= 1 ); + + auto trace = (uintptr_t*)tracy_malloc( ( 1 + depth ) * sizeof( uintptr_t ) ); + const auto num = backtrace( (void**)(trace+1), depth ); + *trace = num; + + return trace; +} + +#endif + +} + +#endif + +#endif diff --git a/libs/tracy/client/TracyDxt1.cpp b/libs/tracy/client/TracyDxt1.cpp @@ -0,0 +1,646 @@ +#include "TracyDxt1.hpp" +#include "../common/TracyForceInline.hpp" + +#include <assert.h> +#include <stdint.h> +#include <string.h> + +#ifdef __ARM_NEON +# include <arm_neon.h> +#endif + +#if defined __AVX__ && !defined __SSE4_1__ +# define __SSE4_1__ +#endif + +#if defined __SSE4_1__ || defined __AVX2__ +# ifdef _MSC_VER +# include <intrin.h> +# else +# include <x86intrin.h> +# ifdef __CYGWIN__ +# ifndef _mm256_cvtsi256_si32 +# define _mm256_cvtsi256_si32( v ) ( _mm_cvtsi128_si32( _mm256_castsi256_si128( v ) ) ) +# endif +# endif +# endif +#endif + +namespace tracy +{ + +static inline uint16_t to565( uint8_t r, uint8_t g, uint8_t b ) +{ + return ( 
( r & 0xF8 ) << 8 ) | ( ( g & 0xFC ) << 3 ) | ( b >> 3 ); +} + +static inline uint16_t to565( uint32_t c ) +{ + return + ( ( c & 0xF80000 ) >> 19 ) | + ( ( c & 0x00FC00 ) >> 5 ) | + ( ( c & 0x0000F8 ) << 8 ); +} + +static const uint16_t DivTable[255*3+1] = { + 0xffff, 0xffff, 0xffff, 0xffff, 0xcccc, 0xaaaa, 0x9249, 0x8000, 0x71c7, 0x6666, 0x5d17, 0x5555, 0x4ec4, 0x4924, 0x4444, 0x4000, + 0x3c3c, 0x38e3, 0x35e5, 0x3333, 0x30c3, 0x2e8b, 0x2c85, 0x2aaa, 0x28f5, 0x2762, 0x25ed, 0x2492, 0x234f, 0x2222, 0x2108, 0x2000, + 0x1f07, 0x1e1e, 0x1d41, 0x1c71, 0x1bac, 0x1af2, 0x1a41, 0x1999, 0x18f9, 0x1861, 0x17d0, 0x1745, 0x16c1, 0x1642, 0x15c9, 0x1555, + 0x14e5, 0x147a, 0x1414, 0x13b1, 0x1352, 0x12f6, 0x129e, 0x1249, 0x11f7, 0x11a7, 0x115b, 0x1111, 0x10c9, 0x1084, 0x1041, 0x1000, + 0x0fc0, 0x0f83, 0x0f48, 0x0f0f, 0x0ed7, 0x0ea0, 0x0e6c, 0x0e38, 0x0e07, 0x0dd6, 0x0da7, 0x0d79, 0x0d4c, 0x0d20, 0x0cf6, 0x0ccc, + 0x0ca4, 0x0c7c, 0x0c56, 0x0c30, 0x0c0c, 0x0be8, 0x0bc5, 0x0ba2, 0x0b81, 0x0b60, 0x0b40, 0x0b21, 0x0b02, 0x0ae4, 0x0ac7, 0x0aaa, + 0x0a8e, 0x0a72, 0x0a57, 0x0a3d, 0x0a23, 0x0a0a, 0x09f1, 0x09d8, 0x09c0, 0x09a9, 0x0991, 0x097b, 0x0964, 0x094f, 0x0939, 0x0924, + 0x090f, 0x08fb, 0x08e7, 0x08d3, 0x08c0, 0x08ad, 0x089a, 0x0888, 0x0876, 0x0864, 0x0853, 0x0842, 0x0831, 0x0820, 0x0810, 0x0800, + 0x07f0, 0x07e0, 0x07d1, 0x07c1, 0x07b3, 0x07a4, 0x0795, 0x0787, 0x0779, 0x076b, 0x075d, 0x0750, 0x0743, 0x0736, 0x0729, 0x071c, + 0x070f, 0x0703, 0x06f7, 0x06eb, 0x06df, 0x06d3, 0x06c8, 0x06bc, 0x06b1, 0x06a6, 0x069b, 0x0690, 0x0685, 0x067b, 0x0670, 0x0666, + 0x065c, 0x0652, 0x0648, 0x063e, 0x0634, 0x062b, 0x0621, 0x0618, 0x060f, 0x0606, 0x05fd, 0x05f4, 0x05eb, 0x05e2, 0x05d9, 0x05d1, + 0x05c9, 0x05c0, 0x05b8, 0x05b0, 0x05a8, 0x05a0, 0x0598, 0x0590, 0x0588, 0x0581, 0x0579, 0x0572, 0x056b, 0x0563, 0x055c, 0x0555, + 0x054e, 0x0547, 0x0540, 0x0539, 0x0532, 0x052b, 0x0525, 0x051e, 0x0518, 0x0511, 0x050b, 0x0505, 0x04fe, 0x04f8, 0x04f2, 0x04ec, + 0x04e6, 0x04e0, 0x04da, 0x04d4, 0x04ce, 0x04c8, 0x04c3, 0x04bd, 0x04b8, 0x04b2, 0x04ad, 0x04a7, 0x04a2, 0x049c, 0x0497, 0x0492, + 0x048d, 0x0487, 0x0482, 0x047d, 0x0478, 0x0473, 0x046e, 0x0469, 0x0465, 0x0460, 0x045b, 0x0456, 0x0452, 0x044d, 0x0448, 0x0444, + 0x043f, 0x043b, 0x0436, 0x0432, 0x042d, 0x0429, 0x0425, 0x0421, 0x041c, 0x0418, 0x0414, 0x0410, 0x040c, 0x0408, 0x0404, 0x0400, + 0x03fc, 0x03f8, 0x03f4, 0x03f0, 0x03ec, 0x03e8, 0x03e4, 0x03e0, 0x03dd, 0x03d9, 0x03d5, 0x03d2, 0x03ce, 0x03ca, 0x03c7, 0x03c3, + 0x03c0, 0x03bc, 0x03b9, 0x03b5, 0x03b2, 0x03ae, 0x03ab, 0x03a8, 0x03a4, 0x03a1, 0x039e, 0x039b, 0x0397, 0x0394, 0x0391, 0x038e, + 0x038b, 0x0387, 0x0384, 0x0381, 0x037e, 0x037b, 0x0378, 0x0375, 0x0372, 0x036f, 0x036c, 0x0369, 0x0366, 0x0364, 0x0361, 0x035e, + 0x035b, 0x0358, 0x0355, 0x0353, 0x0350, 0x034d, 0x034a, 0x0348, 0x0345, 0x0342, 0x0340, 0x033d, 0x033a, 0x0338, 0x0335, 0x0333, + 0x0330, 0x032e, 0x032b, 0x0329, 0x0326, 0x0324, 0x0321, 0x031f, 0x031c, 0x031a, 0x0317, 0x0315, 0x0313, 0x0310, 0x030e, 0x030c, + 0x0309, 0x0307, 0x0305, 0x0303, 0x0300, 0x02fe, 0x02fc, 0x02fa, 0x02f7, 0x02f5, 0x02f3, 0x02f1, 0x02ef, 0x02ec, 0x02ea, 0x02e8, + 0x02e6, 0x02e4, 0x02e2, 0x02e0, 0x02de, 0x02dc, 0x02da, 0x02d8, 0x02d6, 0x02d4, 0x02d2, 0x02d0, 0x02ce, 0x02cc, 0x02ca, 0x02c8, + 0x02c6, 0x02c4, 0x02c2, 0x02c0, 0x02be, 0x02bc, 0x02bb, 0x02b9, 0x02b7, 0x02b5, 0x02b3, 0x02b1, 0x02b0, 0x02ae, 0x02ac, 0x02aa, + 0x02a8, 0x02a7, 0x02a5, 0x02a3, 0x02a1, 0x02a0, 0x029e, 0x029c, 0x029b, 0x0299, 0x0297, 0x0295, 0x0294, 0x0292, 0x0291, 0x028f, + 0x028d, 0x028c, 0x028a, 0x0288, 0x0287, 
0x0285, 0x0284, 0x0282, 0x0280, 0x027f, 0x027d, 0x027c, 0x027a, 0x0279, 0x0277, 0x0276, + 0x0274, 0x0273, 0x0271, 0x0270, 0x026e, 0x026d, 0x026b, 0x026a, 0x0268, 0x0267, 0x0265, 0x0264, 0x0263, 0x0261, 0x0260, 0x025e, + 0x025d, 0x025c, 0x025a, 0x0259, 0x0257, 0x0256, 0x0255, 0x0253, 0x0252, 0x0251, 0x024f, 0x024e, 0x024d, 0x024b, 0x024a, 0x0249, + 0x0247, 0x0246, 0x0245, 0x0243, 0x0242, 0x0241, 0x0240, 0x023e, 0x023d, 0x023c, 0x023b, 0x0239, 0x0238, 0x0237, 0x0236, 0x0234, + 0x0233, 0x0232, 0x0231, 0x0230, 0x022e, 0x022d, 0x022c, 0x022b, 0x022a, 0x0229, 0x0227, 0x0226, 0x0225, 0x0224, 0x0223, 0x0222, + 0x0220, 0x021f, 0x021e, 0x021d, 0x021c, 0x021b, 0x021a, 0x0219, 0x0218, 0x0216, 0x0215, 0x0214, 0x0213, 0x0212, 0x0211, 0x0210, + 0x020f, 0x020e, 0x020d, 0x020c, 0x020b, 0x020a, 0x0209, 0x0208, 0x0207, 0x0206, 0x0205, 0x0204, 0x0203, 0x0202, 0x0201, 0x0200, + 0x01ff, 0x01fe, 0x01fd, 0x01fc, 0x01fb, 0x01fa, 0x01f9, 0x01f8, 0x01f7, 0x01f6, 0x01f5, 0x01f4, 0x01f3, 0x01f2, 0x01f1, 0x01f0, + 0x01ef, 0x01ee, 0x01ed, 0x01ec, 0x01eb, 0x01ea, 0x01e9, 0x01e9, 0x01e8, 0x01e7, 0x01e6, 0x01e5, 0x01e4, 0x01e3, 0x01e2, 0x01e1, + 0x01e0, 0x01e0, 0x01df, 0x01de, 0x01dd, 0x01dc, 0x01db, 0x01da, 0x01da, 0x01d9, 0x01d8, 0x01d7, 0x01d6, 0x01d5, 0x01d4, 0x01d4, + 0x01d3, 0x01d2, 0x01d1, 0x01d0, 0x01cf, 0x01cf, 0x01ce, 0x01cd, 0x01cc, 0x01cb, 0x01cb, 0x01ca, 0x01c9, 0x01c8, 0x01c7, 0x01c7, + 0x01c6, 0x01c5, 0x01c4, 0x01c3, 0x01c3, 0x01c2, 0x01c1, 0x01c0, 0x01c0, 0x01bf, 0x01be, 0x01bd, 0x01bd, 0x01bc, 0x01bb, 0x01ba, + 0x01ba, 0x01b9, 0x01b8, 0x01b7, 0x01b7, 0x01b6, 0x01b5, 0x01b4, 0x01b4, 0x01b3, 0x01b2, 0x01b2, 0x01b1, 0x01b0, 0x01af, 0x01af, + 0x01ae, 0x01ad, 0x01ad, 0x01ac, 0x01ab, 0x01aa, 0x01aa, 0x01a9, 0x01a8, 0x01a8, 0x01a7, 0x01a6, 0x01a6, 0x01a5, 0x01a4, 0x01a4, + 0x01a3, 0x01a2, 0x01a2, 0x01a1, 0x01a0, 0x01a0, 0x019f, 0x019e, 0x019e, 0x019d, 0x019c, 0x019c, 0x019b, 0x019a, 0x019a, 0x0199, + 0x0198, 0x0198, 0x0197, 0x0197, 0x0196, 0x0195, 0x0195, 0x0194, 0x0193, 0x0193, 0x0192, 0x0192, 0x0191, 0x0190, 0x0190, 0x018f, + 0x018f, 0x018e, 0x018d, 0x018d, 0x018c, 0x018b, 0x018b, 0x018a, 0x018a, 0x0189, 0x0189, 0x0188, 0x0187, 0x0187, 0x0186, 0x0186, + 0x0185, 0x0184, 0x0184, 0x0183, 0x0183, 0x0182, 0x0182, 0x0181, 0x0180, 0x0180, 0x017f, 0x017f, 0x017e, 0x017e, 0x017d, 0x017d, + 0x017c, 0x017b, 0x017b, 0x017a, 0x017a, 0x0179, 0x0179, 0x0178, 0x0178, 0x0177, 0x0177, 0x0176, 0x0175, 0x0175, 0x0174, 0x0174, + 0x0173, 0x0173, 0x0172, 0x0172, 0x0171, 0x0171, 0x0170, 0x0170, 0x016f, 0x016f, 0x016e, 0x016e, 0x016d, 0x016d, 0x016c, 0x016c, + 0x016b, 0x016b, 0x016a, 0x016a, 0x0169, 0x0169, 0x0168, 0x0168, 0x0167, 0x0167, 0x0166, 0x0166, 0x0165, 0x0165, 0x0164, 0x0164, + 0x0163, 0x0163, 0x0162, 0x0162, 0x0161, 0x0161, 0x0160, 0x0160, 0x015f, 0x015f, 0x015e, 0x015e, 0x015d, 0x015d, 0x015d, 0x015c, + 0x015c, 0x015b, 0x015b, 0x015a, 0x015a, 0x0159, 0x0159, 0x0158, 0x0158, 0x0158, 0x0157, 0x0157, 0x0156, 0x0156 +}; +static const uint16_t DivTableAVX[255*3+1] = { + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x38e3, 0x35e5, 0x3333, 0x30c3, 0x2e8b, 0x2c85, 0x2aaa, 0x28f5, 0x2762, 0x25ed, 0x2492, 0x234f, 0x2222, 0x2108, 0x2000, + 0x1f07, 0x1e1e, 0x1d41, 0x1c71, 0x1bac, 0x1af2, 0x1a41, 0x1999, 0x18f9, 0x1861, 0x17d0, 0x1745, 0x16c1, 0x1642, 0x15c9, 0x1555, + 0x14e5, 0x147a, 0x1414, 0x13b1, 0x1352, 0x12f6, 0x129e, 0x1249, 0x11f7, 0x11a7, 0x115b, 0x1111, 0x10c9, 0x1084, 0x1041, 0x1000, + 0x0fc0, 0x0f83, 0x0f48, 0x0f0f, 0x0ed7, 0x0ea0, 
0x0e6c, 0x0e38, 0x0e07, 0x0dd6, 0x0da7, 0x0d79, 0x0d4c, 0x0d20, 0x0cf6, 0x0ccc, + 0x0ca4, 0x0c7c, 0x0c56, 0x0c30, 0x0c0c, 0x0be8, 0x0bc5, 0x0ba2, 0x0b81, 0x0b60, 0x0b40, 0x0b21, 0x0b02, 0x0ae4, 0x0ac7, 0x0aaa, + 0x0a8e, 0x0a72, 0x0a57, 0x0a3d, 0x0a23, 0x0a0a, 0x09f1, 0x09d8, 0x09c0, 0x09a9, 0x0991, 0x097b, 0x0964, 0x094f, 0x0939, 0x0924, + 0x090f, 0x08fb, 0x08e7, 0x08d3, 0x08c0, 0x08ad, 0x089a, 0x0888, 0x0876, 0x0864, 0x0853, 0x0842, 0x0831, 0x0820, 0x0810, 0x0800, + 0x07f0, 0x07e0, 0x07d1, 0x07c1, 0x07b3, 0x07a4, 0x0795, 0x0787, 0x0779, 0x076b, 0x075d, 0x0750, 0x0743, 0x0736, 0x0729, 0x071c, + 0x070f, 0x0703, 0x06f7, 0x06eb, 0x06df, 0x06d3, 0x06c8, 0x06bc, 0x06b1, 0x06a6, 0x069b, 0x0690, 0x0685, 0x067b, 0x0670, 0x0666, + 0x065c, 0x0652, 0x0648, 0x063e, 0x0634, 0x062b, 0x0621, 0x0618, 0x060f, 0x0606, 0x05fd, 0x05f4, 0x05eb, 0x05e2, 0x05d9, 0x05d1, + 0x05c9, 0x05c0, 0x05b8, 0x05b0, 0x05a8, 0x05a0, 0x0598, 0x0590, 0x0588, 0x0581, 0x0579, 0x0572, 0x056b, 0x0563, 0x055c, 0x0555, + 0x054e, 0x0547, 0x0540, 0x0539, 0x0532, 0x052b, 0x0525, 0x051e, 0x0518, 0x0511, 0x050b, 0x0505, 0x04fe, 0x04f8, 0x04f2, 0x04ec, + 0x04e6, 0x04e0, 0x04da, 0x04d4, 0x04ce, 0x04c8, 0x04c3, 0x04bd, 0x04b8, 0x04b2, 0x04ad, 0x04a7, 0x04a2, 0x049c, 0x0497, 0x0492, + 0x048d, 0x0487, 0x0482, 0x047d, 0x0478, 0x0473, 0x046e, 0x0469, 0x0465, 0x0460, 0x045b, 0x0456, 0x0452, 0x044d, 0x0448, 0x0444, + 0x043f, 0x043b, 0x0436, 0x0432, 0x042d, 0x0429, 0x0425, 0x0421, 0x041c, 0x0418, 0x0414, 0x0410, 0x040c, 0x0408, 0x0404, 0x0400, + 0x03fc, 0x03f8, 0x03f4, 0x03f0, 0x03ec, 0x03e8, 0x03e4, 0x03e0, 0x03dd, 0x03d9, 0x03d5, 0x03d2, 0x03ce, 0x03ca, 0x03c7, 0x03c3, + 0x03c0, 0x03bc, 0x03b9, 0x03b5, 0x03b2, 0x03ae, 0x03ab, 0x03a8, 0x03a4, 0x03a1, 0x039e, 0x039b, 0x0397, 0x0394, 0x0391, 0x038e, + 0x038b, 0x0387, 0x0384, 0x0381, 0x037e, 0x037b, 0x0378, 0x0375, 0x0372, 0x036f, 0x036c, 0x0369, 0x0366, 0x0364, 0x0361, 0x035e, + 0x035b, 0x0358, 0x0355, 0x0353, 0x0350, 0x034d, 0x034a, 0x0348, 0x0345, 0x0342, 0x0340, 0x033d, 0x033a, 0x0338, 0x0335, 0x0333, + 0x0330, 0x032e, 0x032b, 0x0329, 0x0326, 0x0324, 0x0321, 0x031f, 0x031c, 0x031a, 0x0317, 0x0315, 0x0313, 0x0310, 0x030e, 0x030c, + 0x0309, 0x0307, 0x0305, 0x0303, 0x0300, 0x02fe, 0x02fc, 0x02fa, 0x02f7, 0x02f5, 0x02f3, 0x02f1, 0x02ef, 0x02ec, 0x02ea, 0x02e8, + 0x02e6, 0x02e4, 0x02e2, 0x02e0, 0x02de, 0x02dc, 0x02da, 0x02d8, 0x02d6, 0x02d4, 0x02d2, 0x02d0, 0x02ce, 0x02cc, 0x02ca, 0x02c8, + 0x02c6, 0x02c4, 0x02c2, 0x02c0, 0x02be, 0x02bc, 0x02bb, 0x02b9, 0x02b7, 0x02b5, 0x02b3, 0x02b1, 0x02b0, 0x02ae, 0x02ac, 0x02aa, + 0x02a8, 0x02a7, 0x02a5, 0x02a3, 0x02a1, 0x02a0, 0x029e, 0x029c, 0x029b, 0x0299, 0x0297, 0x0295, 0x0294, 0x0292, 0x0291, 0x028f, + 0x028d, 0x028c, 0x028a, 0x0288, 0x0287, 0x0285, 0x0284, 0x0282, 0x0280, 0x027f, 0x027d, 0x027c, 0x027a, 0x0279, 0x0277, 0x0276, + 0x0274, 0x0273, 0x0271, 0x0270, 0x026e, 0x026d, 0x026b, 0x026a, 0x0268, 0x0267, 0x0265, 0x0264, 0x0263, 0x0261, 0x0260, 0x025e, + 0x025d, 0x025c, 0x025a, 0x0259, 0x0257, 0x0256, 0x0255, 0x0253, 0x0252, 0x0251, 0x024f, 0x024e, 0x024d, 0x024b, 0x024a, 0x0249, + 0x0247, 0x0246, 0x0245, 0x0243, 0x0242, 0x0241, 0x0240, 0x023e, 0x023d, 0x023c, 0x023b, 0x0239, 0x0238, 0x0237, 0x0236, 0x0234, + 0x0233, 0x0232, 0x0231, 0x0230, 0x022e, 0x022d, 0x022c, 0x022b, 0x022a, 0x0229, 0x0227, 0x0226, 0x0225, 0x0224, 0x0223, 0x0222, + 0x0220, 0x021f, 0x021e, 0x021d, 0x021c, 0x021b, 0x021a, 0x0219, 0x0218, 0x0216, 0x0215, 0x0214, 0x0213, 0x0212, 0x0211, 0x0210, + 0x020f, 0x020e, 0x020d, 0x020c, 0x020b, 0x020a, 0x0209, 0x0208, 0x0207, 0x0206, 0x0205, 
0x0204, 0x0203, 0x0202, 0x0201, 0x0200, + 0x01ff, 0x01fe, 0x01fd, 0x01fc, 0x01fb, 0x01fa, 0x01f9, 0x01f8, 0x01f7, 0x01f6, 0x01f5, 0x01f4, 0x01f3, 0x01f2, 0x01f1, 0x01f0, + 0x01ef, 0x01ee, 0x01ed, 0x01ec, 0x01eb, 0x01ea, 0x01e9, 0x01e9, 0x01e8, 0x01e7, 0x01e6, 0x01e5, 0x01e4, 0x01e3, 0x01e2, 0x01e1, + 0x01e0, 0x01e0, 0x01df, 0x01de, 0x01dd, 0x01dc, 0x01db, 0x01da, 0x01da, 0x01d9, 0x01d8, 0x01d7, 0x01d6, 0x01d5, 0x01d4, 0x01d4, + 0x01d3, 0x01d2, 0x01d1, 0x01d0, 0x01cf, 0x01cf, 0x01ce, 0x01cd, 0x01cc, 0x01cb, 0x01cb, 0x01ca, 0x01c9, 0x01c8, 0x01c7, 0x01c7, + 0x01c6, 0x01c5, 0x01c4, 0x01c3, 0x01c3, 0x01c2, 0x01c1, 0x01c0, 0x01c0, 0x01bf, 0x01be, 0x01bd, 0x01bd, 0x01bc, 0x01bb, 0x01ba, + 0x01ba, 0x01b9, 0x01b8, 0x01b7, 0x01b7, 0x01b6, 0x01b5, 0x01b4, 0x01b4, 0x01b3, 0x01b2, 0x01b2, 0x01b1, 0x01b0, 0x01af, 0x01af, + 0x01ae, 0x01ad, 0x01ad, 0x01ac, 0x01ab, 0x01aa, 0x01aa, 0x01a9, 0x01a8, 0x01a8, 0x01a7, 0x01a6, 0x01a6, 0x01a5, 0x01a4, 0x01a4, + 0x01a3, 0x01a2, 0x01a2, 0x01a1, 0x01a0, 0x01a0, 0x019f, 0x019e, 0x019e, 0x019d, 0x019c, 0x019c, 0x019b, 0x019a, 0x019a, 0x0199, + 0x0198, 0x0198, 0x0197, 0x0197, 0x0196, 0x0195, 0x0195, 0x0194, 0x0193, 0x0193, 0x0192, 0x0192, 0x0191, 0x0190, 0x0190, 0x018f, + 0x018f, 0x018e, 0x018d, 0x018d, 0x018c, 0x018b, 0x018b, 0x018a, 0x018a, 0x0189, 0x0189, 0x0188, 0x0187, 0x0187, 0x0186, 0x0186, + 0x0185, 0x0184, 0x0184, 0x0183, 0x0183, 0x0182, 0x0182, 0x0181, 0x0180, 0x0180, 0x017f, 0x017f, 0x017e, 0x017e, 0x017d, 0x017d, + 0x017c, 0x017b, 0x017b, 0x017a, 0x017a, 0x0179, 0x0179, 0x0178, 0x0178, 0x0177, 0x0177, 0x0176, 0x0175, 0x0175, 0x0174, 0x0174, + 0x0173, 0x0173, 0x0172, 0x0172, 0x0171, 0x0171, 0x0170, 0x0170, 0x016f, 0x016f, 0x016e, 0x016e, 0x016d, 0x016d, 0x016c, 0x016c, + 0x016b, 0x016b, 0x016a, 0x016a, 0x0169, 0x0169, 0x0168, 0x0168, 0x0167, 0x0167, 0x0166, 0x0166, 0x0165, 0x0165, 0x0164, 0x0164, + 0x0163, 0x0163, 0x0162, 0x0162, 0x0161, 0x0161, 0x0160, 0x0160, 0x015f, 0x015f, 0x015e, 0x015e, 0x015d, 0x015d, 0x015d, 0x015c, + 0x015c, 0x015b, 0x015b, 0x015a, 0x015a, 0x0159, 0x0159, 0x0158, 0x0158, 0x0158, 0x0157, 0x0157, 0x0156, 0x0156 +}; +static const uint16_t DivTableNEON[255*3+1] = { + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x1c71, 0x1af2, 0x1999, 0x1861, 0x1745, 0x1642, 0x1555, 0x147a, 0x13b1, 0x12f6, 0x1249, 0x11a7, 0x1111, 0x1084, 0x1000, + 0x0f83, 0x0f0f, 0x0ea0, 0x0e38, 0x0dd6, 0x0d79, 0x0d20, 0x0ccc, 0x0c7c, 0x0c30, 0x0be8, 0x0ba2, 0x0b60, 0x0b21, 0x0ae4, 0x0aaa, + 0x0a72, 0x0a3d, 0x0a0a, 0x09d8, 0x09a9, 0x097b, 0x094f, 0x0924, 0x08fb, 0x08d3, 0x08ad, 0x0888, 0x0864, 0x0842, 0x0820, 0x0800, + 0x07e0, 0x07c1, 0x07a4, 0x0787, 0x076b, 0x0750, 0x0736, 0x071c, 0x0703, 0x06eb, 0x06d3, 0x06bc, 0x06a6, 0x0690, 0x067b, 0x0666, + 0x0652, 0x063e, 0x062b, 0x0618, 0x0606, 0x05f4, 0x05e2, 0x05d1, 0x05c0, 0x05b0, 0x05a0, 0x0590, 0x0581, 0x0572, 0x0563, 0x0555, + 0x0547, 0x0539, 0x052b, 0x051e, 0x0511, 0x0505, 0x04f8, 0x04ec, 0x04e0, 0x04d4, 0x04c8, 0x04bd, 0x04b2, 0x04a7, 0x049c, 0x0492, + 0x0487, 0x047d, 0x0473, 0x0469, 0x0460, 0x0456, 0x044d, 0x0444, 0x043b, 0x0432, 0x0429, 0x0421, 0x0418, 0x0410, 0x0408, 0x0400, + 0x03f8, 0x03f0, 0x03e8, 0x03e0, 0x03d9, 0x03d2, 0x03ca, 0x03c3, 0x03bc, 0x03b5, 0x03ae, 0x03a8, 0x03a1, 0x039b, 0x0394, 0x038e, + 0x0387, 0x0381, 0x037b, 0x0375, 0x036f, 0x0369, 0x0364, 0x035e, 0x0358, 0x0353, 0x034d, 0x0348, 0x0342, 0x033d, 0x0338, 0x0333, + 0x032e, 0x0329, 0x0324, 0x031f, 0x031a, 0x0315, 0x0310, 0x030c, 0x0307, 0x0303, 0x02fe, 0x02fa, 
0x02f5, 0x02f1, 0x02ec, 0x02e8, + 0x02e4, 0x02e0, 0x02dc, 0x02d8, 0x02d4, 0x02d0, 0x02cc, 0x02c8, 0x02c4, 0x02c0, 0x02bc, 0x02b9, 0x02b5, 0x02b1, 0x02ae, 0x02aa, + 0x02a7, 0x02a3, 0x02a0, 0x029c, 0x0299, 0x0295, 0x0292, 0x028f, 0x028c, 0x0288, 0x0285, 0x0282, 0x027f, 0x027c, 0x0279, 0x0276, + 0x0273, 0x0270, 0x026d, 0x026a, 0x0267, 0x0264, 0x0261, 0x025e, 0x025c, 0x0259, 0x0256, 0x0253, 0x0251, 0x024e, 0x024b, 0x0249, + 0x0246, 0x0243, 0x0241, 0x023e, 0x023c, 0x0239, 0x0237, 0x0234, 0x0232, 0x0230, 0x022d, 0x022b, 0x0229, 0x0226, 0x0224, 0x0222, + 0x021f, 0x021d, 0x021b, 0x0219, 0x0216, 0x0214, 0x0212, 0x0210, 0x020e, 0x020c, 0x020a, 0x0208, 0x0206, 0x0204, 0x0202, 0x0200, + 0x01fe, 0x01fc, 0x01fa, 0x01f8, 0x01f6, 0x01f4, 0x01f2, 0x01f0, 0x01ee, 0x01ec, 0x01ea, 0x01e9, 0x01e7, 0x01e5, 0x01e3, 0x01e1, + 0x01e0, 0x01de, 0x01dc, 0x01da, 0x01d9, 0x01d7, 0x01d5, 0x01d4, 0x01d2, 0x01d0, 0x01cf, 0x01cd, 0x01cb, 0x01ca, 0x01c8, 0x01c7, + 0x01c5, 0x01c3, 0x01c2, 0x01c0, 0x01bf, 0x01bd, 0x01bc, 0x01ba, 0x01b9, 0x01b7, 0x01b6, 0x01b4, 0x01b3, 0x01b2, 0x01b0, 0x01af, + 0x01ad, 0x01ac, 0x01aa, 0x01a9, 0x01a8, 0x01a6, 0x01a5, 0x01a4, 0x01a2, 0x01a1, 0x01a0, 0x019e, 0x019d, 0x019c, 0x019a, 0x0199, + 0x0198, 0x0197, 0x0195, 0x0194, 0x0193, 0x0192, 0x0190, 0x018f, 0x018e, 0x018d, 0x018b, 0x018a, 0x0189, 0x0188, 0x0187, 0x0186, + 0x0184, 0x0183, 0x0182, 0x0181, 0x0180, 0x017f, 0x017e, 0x017d, 0x017b, 0x017a, 0x0179, 0x0178, 0x0177, 0x0176, 0x0175, 0x0174, + 0x0173, 0x0172, 0x0171, 0x0170, 0x016f, 0x016e, 0x016d, 0x016c, 0x016b, 0x016a, 0x0169, 0x0168, 0x0167, 0x0166, 0x0165, 0x0164, + 0x0163, 0x0162, 0x0161, 0x0160, 0x015f, 0x015e, 0x015d, 0x015c, 0x015b, 0x015a, 0x0159, 0x0158, 0x0158, 0x0157, 0x0156, 0x0155, + 0x0154, 0x0153, 0x0152, 0x0151, 0x0150, 0x0150, 0x014f, 0x014e, 0x014d, 0x014c, 0x014b, 0x014a, 0x014a, 0x0149, 0x0148, 0x0147, + 0x0146, 0x0146, 0x0145, 0x0144, 0x0143, 0x0142, 0x0142, 0x0141, 0x0140, 0x013f, 0x013e, 0x013e, 0x013d, 0x013c, 0x013b, 0x013b, + 0x013a, 0x0139, 0x0138, 0x0138, 0x0137, 0x0136, 0x0135, 0x0135, 0x0134, 0x0133, 0x0132, 0x0132, 0x0131, 0x0130, 0x0130, 0x012f, + 0x012e, 0x012e, 0x012d, 0x012c, 0x012b, 0x012b, 0x012a, 0x0129, 0x0129, 0x0128, 0x0127, 0x0127, 0x0126, 0x0125, 0x0125, 0x0124, + 0x0123, 0x0123, 0x0122, 0x0121, 0x0121, 0x0120, 0x0120, 0x011f, 0x011e, 0x011e, 0x011d, 0x011c, 0x011c, 0x011b, 0x011b, 0x011a, + 0x0119, 0x0119, 0x0118, 0x0118, 0x0117, 0x0116, 0x0116, 0x0115, 0x0115, 0x0114, 0x0113, 0x0113, 0x0112, 0x0112, 0x0111, 0x0111, + 0x0110, 0x010f, 0x010f, 0x010e, 0x010e, 0x010d, 0x010d, 0x010c, 0x010c, 0x010b, 0x010a, 0x010a, 0x0109, 0x0109, 0x0108, 0x0108, + 0x0107, 0x0107, 0x0106, 0x0106, 0x0105, 0x0105, 0x0104, 0x0104, 0x0103, 0x0103, 0x0102, 0x0102, 0x0101, 0x0101, 0x0100, 0x0100, + 0x00ff, 0x00ff, 0x00fe, 0x00fe, 0x00fd, 0x00fd, 0x00fc, 0x00fc, 0x00fb, 0x00fb, 0x00fa, 0x00fa, 0x00f9, 0x00f9, 0x00f8, 0x00f8, + 0x00f7, 0x00f7, 0x00f6, 0x00f6, 0x00f5, 0x00f5, 0x00f4, 0x00f4, 0x00f4, 0x00f3, 0x00f3, 0x00f2, 0x00f2, 0x00f1, 0x00f1, 0x00f0, + 0x00f0, 0x00f0, 0x00ef, 0x00ef, 0x00ee, 0x00ee, 0x00ed, 0x00ed, 0x00ed, 0x00ec, 0x00ec, 0x00eb, 0x00eb, 0x00ea, 0x00ea, 0x00ea, + 0x00e9, 0x00e9, 0x00e8, 0x00e8, 0x00e7, 0x00e7, 0x00e7, 0x00e6, 0x00e6, 0x00e5, 0x00e5, 0x00e5, 0x00e4, 0x00e4, 0x00e3, 0x00e3, + 0x00e3, 0x00e2, 0x00e2, 0x00e1, 0x00e1, 0x00e1, 0x00e0, 0x00e0, 0x00e0, 0x00df, 0x00df, 0x00de, 0x00de, 0x00de, 0x00dd, 0x00dd, + 0x00dd, 0x00dc, 0x00dc, 0x00db, 0x00db, 0x00db, 0x00da, 0x00da, 0x00da, 0x00d9, 0x00d9, 0x00d9, 0x00d8, 0x00d8, 0x00d7, 0x00d7, + 0x00d7, 
0x00d6, 0x00d6, 0x00d6, 0x00d5, 0x00d5, 0x00d5, 0x00d4, 0x00d4, 0x00d4, 0x00d3, 0x00d3, 0x00d3, 0x00d2, 0x00d2, 0x00d2, + 0x00d1, 0x00d1, 0x00d1, 0x00d0, 0x00d0, 0x00d0, 0x00cf, 0x00cf, 0x00cf, 0x00ce, 0x00ce, 0x00ce, 0x00cd, 0x00cd, 0x00cd, 0x00cc, + 0x00cc, 0x00cc, 0x00cb, 0x00cb, 0x00cb, 0x00ca, 0x00ca, 0x00ca, 0x00c9, 0x00c9, 0x00c9, 0x00c9, 0x00c8, 0x00c8, 0x00c8, 0x00c7, + 0x00c7, 0x00c7, 0x00c6, 0x00c6, 0x00c6, 0x00c5, 0x00c5, 0x00c5, 0x00c5, 0x00c4, 0x00c4, 0x00c4, 0x00c3, 0x00c3, 0x00c3, 0x00c3, + 0x00c2, 0x00c2, 0x00c2, 0x00c1, 0x00c1, 0x00c1, 0x00c1, 0x00c0, 0x00c0, 0x00c0, 0x00bf, 0x00bf, 0x00bf, 0x00bf, 0x00be, 0x00be, + 0x00be, 0x00bd, 0x00bd, 0x00bd, 0x00bd, 0x00bc, 0x00bc, 0x00bc, 0x00bc, 0x00bb, 0x00bb, 0x00bb, 0x00ba, 0x00ba, 0x00ba, 0x00ba, + 0x00b9, 0x00b9, 0x00b9, 0x00b9, 0x00b8, 0x00b8, 0x00b8, 0x00b8, 0x00b7, 0x00b7, 0x00b7, 0x00b7, 0x00b6, 0x00b6, 0x00b6, 0x00b6, + 0x00b5, 0x00b5, 0x00b5, 0x00b5, 0x00b4, 0x00b4, 0x00b4, 0x00b4, 0x00b3, 0x00b3, 0x00b3, 0x00b3, 0x00b2, 0x00b2, 0x00b2, 0x00b2, + 0x00b1, 0x00b1, 0x00b1, 0x00b1, 0x00b0, 0x00b0, 0x00b0, 0x00b0, 0x00af, 0x00af, 0x00af, 0x00af, 0x00ae, 0x00ae, 0x00ae, 0x00ae, + 0x00ae, 0x00ad, 0x00ad, 0x00ad, 0x00ad, 0x00ac, 0x00ac, 0x00ac, 0x00ac, 0x00ac, 0x00ab, 0x00ab, 0x00ab, 0x00ab, +}; + + +static tracy_force_inline uint64_t ProcessRGB( const uint8_t* src ) +{ +#ifdef __SSE4_1__ + __m128i px0 = _mm_loadu_si128(((__m128i*)src) + 0); + __m128i px1 = _mm_loadu_si128(((__m128i*)src) + 1); + __m128i px2 = _mm_loadu_si128(((__m128i*)src) + 2); + __m128i px3 = _mm_loadu_si128(((__m128i*)src) + 3); + + __m128i smask = _mm_set1_epi32( 0xF8FCF8 ); + __m128i sd0 = _mm_and_si128( px0, smask ); + __m128i sd1 = _mm_and_si128( px1, smask ); + __m128i sd2 = _mm_and_si128( px2, smask ); + __m128i sd3 = _mm_and_si128( px3, smask ); + + __m128i sc = _mm_shuffle_epi32(sd0, _MM_SHUFFLE(0, 0, 0, 0)); + + __m128i sc0 = _mm_cmpeq_epi8(sd0, sc); + __m128i sc1 = _mm_cmpeq_epi8(sd1, sc); + __m128i sc2 = _mm_cmpeq_epi8(sd2, sc); + __m128i sc3 = _mm_cmpeq_epi8(sd3, sc); + + __m128i sm0 = _mm_and_si128(sc0, sc1); + __m128i sm1 = _mm_and_si128(sc2, sc3); + __m128i sm = _mm_and_si128(sm0, sm1); + + if( _mm_testc_si128(sm, _mm_set1_epi32(-1)) ) + { + return uint64_t( to565( src[0], src[1], src[2] ) ) << 16; + } + + __m128i min0 = _mm_min_epu8( px0, px1 ); + __m128i min1 = _mm_min_epu8( px2, px3 ); + __m128i min2 = _mm_min_epu8( min0, min1 ); + + __m128i max0 = _mm_max_epu8( px0, px1 ); + __m128i max1 = _mm_max_epu8( px2, px3 ); + __m128i max2 = _mm_max_epu8( max0, max1 ); + + __m128i min3 = _mm_shuffle_epi32( min2, _MM_SHUFFLE( 2, 3, 0, 1 ) ); + __m128i max3 = _mm_shuffle_epi32( max2, _MM_SHUFFLE( 2, 3, 0, 1 ) ); + __m128i min4 = _mm_min_epu8( min2, min3 ); + __m128i max4 = _mm_max_epu8( max2, max3 ); + + __m128i min5 = _mm_shuffle_epi32( min4, _MM_SHUFFLE( 0, 0, 2, 2 ) ); + __m128i max5 = _mm_shuffle_epi32( max4, _MM_SHUFFLE( 0, 0, 2, 2 ) ); + __m128i rmin = _mm_min_epu8( min4, min5 ); + __m128i rmax = _mm_max_epu8( max4, max5 ); + + __m128i range1 = _mm_subs_epu8( rmax, rmin ); + __m128i range2 = _mm_sad_epu8( rmax, rmin ); + + uint32_t vrange = _mm_cvtsi128_si32( range2 ) >> 1; + __m128i range = _mm_set1_epi16( DivTable[vrange] ); + + __m128i inset1 = _mm_srli_epi16( range1, 4 ); + __m128i inset = _mm_and_si128( inset1, _mm_set1_epi8( 0xF ) ); + __m128i min = _mm_adds_epu8( rmin, inset ); + __m128i max = _mm_subs_epu8( rmax, inset ); + + __m128i c0 = _mm_subs_epu8( px0, rmin ); + __m128i c1 = _mm_subs_epu8( px1, rmin ); + __m128i c2 = 
_mm_subs_epu8( px2, rmin ); + __m128i c3 = _mm_subs_epu8( px3, rmin ); + + __m128i is0 = _mm_maddubs_epi16( c0, _mm_set1_epi8( 1 ) ); + __m128i is1 = _mm_maddubs_epi16( c1, _mm_set1_epi8( 1 ) ); + __m128i is2 = _mm_maddubs_epi16( c2, _mm_set1_epi8( 1 ) ); + __m128i is3 = _mm_maddubs_epi16( c3, _mm_set1_epi8( 1 ) ); + + __m128i s0 = _mm_hadd_epi16( is0, is1 ); + __m128i s1 = _mm_hadd_epi16( is2, is3 ); + + __m128i m0 = _mm_mulhi_epu16( s0, range ); + __m128i m1 = _mm_mulhi_epu16( s1, range ); + + __m128i p0 = _mm_packus_epi16( m0, m1 ); + + __m128i p1 = _mm_or_si128( _mm_srai_epi32( p0, 6 ), _mm_srai_epi32( p0, 12 ) ); + __m128i p2 = _mm_or_si128( _mm_srai_epi32( p0, 18 ), p0 ); + __m128i p3 = _mm_or_si128( p1, p2 ); + __m128i p =_mm_shuffle_epi8( p3, _mm_set1_epi32( 0x0C080400 ) ); + + uint32_t vmin = _mm_cvtsi128_si32( min ); + uint32_t vmax = _mm_cvtsi128_si32( max ); + uint32_t vp = _mm_cvtsi128_si32( p ); + + return uint64_t( ( uint64_t( to565( vmin ) ) << 16 ) | to565( vmax ) | ( uint64_t( vp ) << 32 ) ); +#elif defined __ARM_NEON +# ifdef __aarch64__ + uint8x16x4_t px = vld4q_u8( src ); + + uint8x16_t lr = px.val[0]; + uint8x16_t lg = px.val[1]; + uint8x16_t lb = px.val[2]; + + uint8_t rmaxr = vmaxvq_u8( lr ); + uint8_t rmaxg = vmaxvq_u8( lg ); + uint8_t rmaxb = vmaxvq_u8( lb ); + + uint8_t rminr = vminvq_u8( lr ); + uint8_t rming = vminvq_u8( lg ); + uint8_t rminb = vminvq_u8( lb ); + + int rr = rmaxr - rminr; + int rg = rmaxg - rming; + int rb = rmaxb - rminb; + + int vrange1 = rr + rg + rb; + uint16_t vrange2 = DivTableNEON[vrange1]; + + uint8_t insetr = rr >> 4; + uint8_t insetg = rg >> 4; + uint8_t insetb = rb >> 4; + + uint8_t minr = rminr + insetr; + uint8_t ming = rming + insetg; + uint8_t minb = rminb + insetb; + + uint8_t maxr = rmaxr - insetr; + uint8_t maxg = rmaxg - insetg; + uint8_t maxb = rmaxb - insetb; + + uint8x16_t cr = vsubq_u8( lr, vdupq_n_u8( rminr ) ); + uint8x16_t cg = vsubq_u8( lg, vdupq_n_u8( rming ) ); + uint8x16_t cb = vsubq_u8( lb, vdupq_n_u8( rminb ) ); + + uint16x8_t is0l = vaddl_u8( vget_low_u8( cr ), vget_low_u8( cg ) ); + uint16x8_t is0h = vaddl_u8( vget_high_u8( cr ), vget_high_u8( cg ) ); + uint16x8_t is1l = vaddw_u8( is0l, vget_low_u8( cb ) ); + uint16x8_t is1h = vaddw_u8( is0h, vget_high_u8( cb ) ); + + int16x8_t range = vdupq_n_s16( vrange2 ); + uint16x8_t m0 = vreinterpretq_u16_s16( vqdmulhq_s16( vreinterpretq_s16_u16( is1l ), range ) ); + uint16x8_t m1 = vreinterpretq_u16_s16( vqdmulhq_s16( vreinterpretq_s16_u16( is1h ), range ) ); + + uint8x8_t p00 = vmovn_u16( m0 ); + uint8x8_t p01 = vmovn_u16( m1 ); + uint8x16_t p0 = vcombine_u8( p00, p01 ); + + uint32x4_t p1 = vaddq_u32( vshrq_n_u32( vreinterpretq_u32_u8( p0 ), 6 ), vshrq_n_u32( vreinterpretq_u32_u8( p0 ), 12 ) ); + uint32x4_t p2 = vaddq_u32( vshrq_n_u32( vreinterpretq_u32_u8( p0 ), 18 ), vreinterpretq_u32_u8( p0 ) ); + uint32x4_t p3 = vaddq_u32( p1, p2 ); + + uint16x4x2_t p4 = vuzp_u16( vget_low_u16( vreinterpretq_u16_u32( p3 ) ), vget_high_u16( vreinterpretq_u16_u32( p3 ) ) ); + uint8x8x2_t p = vuzp_u8( vreinterpret_u8_u16( p4.val[0] ), vreinterpret_u8_u16( p4.val[0] ) ); + + uint32_t vp; + vst1_lane_u32( &vp, vreinterpret_u32_u8( p.val[0] ), 0 ); + + return uint64_t( ( uint64_t( to565( minr, ming, minb ) ) << 16 ) | to565( maxr, maxg, maxb ) | ( uint64_t( vp ) << 32 ) ); +# else + uint32x4_t px0 = vld1q_u32( (uint32_t*)src ); + uint32x4_t px1 = vld1q_u32( (uint32_t*)src + 4 ); + uint32x4_t px2 = vld1q_u32( (uint32_t*)src + 8 ); + uint32x4_t px3 = vld1q_u32( (uint32_t*)src + 12 ); + + 
uint32x4_t smask = vdupq_n_u32( 0xF8FCF8 ); + uint32x4_t sd0 = vandq_u32( smask, px0 ); + uint32x4_t sd1 = vandq_u32( smask, px1 ); + uint32x4_t sd2 = vandq_u32( smask, px2 ); + uint32x4_t sd3 = vandq_u32( smask, px3 ); + + uint32x4_t sc = vdupq_n_u32( sd0[0] ); + + uint32x4_t sc0 = vceqq_u32( sd0, sc ); + uint32x4_t sc1 = vceqq_u32( sd1, sc ); + uint32x4_t sc2 = vceqq_u32( sd2, sc ); + uint32x4_t sc3 = vceqq_u32( sd3, sc ); + + uint32x4_t sm0 = vandq_u32( sc0, sc1 ); + uint32x4_t sm1 = vandq_u32( sc2, sc3 ); + int64x2_t sm = vreinterpretq_s64_u32( vandq_u32( sm0, sm1 ) ); + + if( sm[0] == -1 && sm[1] == -1 ) + { + return uint64_t( to565( src[0], src[1], src[2] ) ) << 16; + } + + uint32x4_t mask = vdupq_n_u32( 0xFFFFFF ); + uint8x16_t l0 = vreinterpretq_u8_u32( vandq_u32( mask, px0 ) ); + uint8x16_t l1 = vreinterpretq_u8_u32( vandq_u32( mask, px1 ) ); + uint8x16_t l2 = vreinterpretq_u8_u32( vandq_u32( mask, px2 ) ); + uint8x16_t l3 = vreinterpretq_u8_u32( vandq_u32( mask, px3 ) ); + + uint8x16_t min0 = vminq_u8( l0, l1 ); + uint8x16_t min1 = vminq_u8( l2, l3 ); + uint8x16_t min2 = vminq_u8( min0, min1 ); + + uint8x16_t max0 = vmaxq_u8( l0, l1 ); + uint8x16_t max1 = vmaxq_u8( l2, l3 ); + uint8x16_t max2 = vmaxq_u8( max0, max1 ); + + uint8x16_t min3 = vreinterpretq_u8_u32( vrev64q_u32( vreinterpretq_u32_u8( min2 ) ) ); + uint8x16_t max3 = vreinterpretq_u8_u32( vrev64q_u32( vreinterpretq_u32_u8( max2 ) ) ); + + uint8x16_t min4 = vminq_u8( min2, min3 ); + uint8x16_t max4 = vmaxq_u8( max2, max3 ); + + uint8x16_t min5 = vcombine_u8( vget_high_u8( min4 ), vget_low_u8( min4 ) ); + uint8x16_t max5 = vcombine_u8( vget_high_u8( max4 ), vget_low_u8( max4 ) ); + + uint8x16_t rmin = vminq_u8( min4, min5 ); + uint8x16_t rmax = vmaxq_u8( max4, max5 ); + + uint8x16_t range1 = vsubq_u8( rmax, rmin ); + uint8x8_t range2 = vget_low_u8( range1 ); + uint8x8x2_t range3 = vzip_u8( range2, vdup_n_u8( 0 ) ); + uint16x4_t range4 = vreinterpret_u16_u8( range3.val[0] ); + + uint16_t vrange1; + uint16x4_t range5 = vpadd_u16( range4, range4 ); + uint16x4_t range6 = vpadd_u16( range5, range5 ); + vst1_lane_u16( &vrange1, range6, 0 ); + + uint32_t vrange2 = ( 2 << 16 ) / uint32_t( vrange1 + 1 ); + uint16x8_t range = vdupq_n_u16( vrange2 ); + + uint8x16_t inset = vshrq_n_u8( range1, 4 ); + uint8x16_t min = vaddq_u8( rmin, inset ); + uint8x16_t max = vsubq_u8( rmax, inset ); + + uint8x16_t c0 = vsubq_u8( l0, rmin ); + uint8x16_t c1 = vsubq_u8( l1, rmin ); + uint8x16_t c2 = vsubq_u8( l2, rmin ); + uint8x16_t c3 = vsubq_u8( l3, rmin ); + + uint16x8_t is0 = vpaddlq_u8( c0 ); + uint16x8_t is1 = vpaddlq_u8( c1 ); + uint16x8_t is2 = vpaddlq_u8( c2 ); + uint16x8_t is3 = vpaddlq_u8( c3 ); + + uint16x4_t is4 = vpadd_u16( vget_low_u16( is0 ), vget_high_u16( is0 ) ); + uint16x4_t is5 = vpadd_u16( vget_low_u16( is1 ), vget_high_u16( is1 ) ); + uint16x4_t is6 = vpadd_u16( vget_low_u16( is2 ), vget_high_u16( is2 ) ); + uint16x4_t is7 = vpadd_u16( vget_low_u16( is3 ), vget_high_u16( is3 ) ); + + uint16x8_t s0 = vcombine_u16( is4, is5 ); + uint16x8_t s1 = vcombine_u16( is6, is7 ); + + uint16x8_t m0 = vreinterpretq_u16_s16( vqdmulhq_s16( vreinterpretq_s16_u16( s0 ), vreinterpretq_s16_u16( range ) ) ); + uint16x8_t m1 = vreinterpretq_u16_s16( vqdmulhq_s16( vreinterpretq_s16_u16( s1 ), vreinterpretq_s16_u16( range ) ) ); + + uint8x8_t p00 = vmovn_u16( m0 ); + uint8x8_t p01 = vmovn_u16( m1 ); + uint8x16_t p0 = vcombine_u8( p00, p01 ); + + uint32x4_t p1 = vaddq_u32( vshrq_n_u32( vreinterpretq_u32_u8( p0 ), 6 ), vshrq_n_u32( 
vreinterpretq_u32_u8( p0 ), 12 ) ); + uint32x4_t p2 = vaddq_u32( vshrq_n_u32( vreinterpretq_u32_u8( p0 ), 18 ), vreinterpretq_u32_u8( p0 ) ); + uint32x4_t p3 = vaddq_u32( p1, p2 ); + + uint16x4x2_t p4 = vuzp_u16( vget_low_u16( vreinterpretq_u16_u32( p3 ) ), vget_high_u16( vreinterpretq_u16_u32( p3 ) ) ); + uint8x8x2_t p = vuzp_u8( vreinterpret_u8_u16( p4.val[0] ), vreinterpret_u8_u16( p4.val[0] ) ); + + uint32_t vmin, vmax, vp; + vst1q_lane_u32( &vmin, vreinterpretq_u32_u8( min ), 0 ); + vst1q_lane_u32( &vmax, vreinterpretq_u32_u8( max ), 0 ); + vst1_lane_u32( &vp, vreinterpret_u32_u8( p.val[0] ), 0 ); + + return uint64_t( ( uint64_t( to565( vmin ) ) << 16 ) | to565( vmax ) | ( uint64_t( vp ) << 32 ) ); +# endif +#else + const auto ref = to565( src[0], src[1], src[2] ); + auto stmp = src + 4; + for( int i=1; i<16; i++ ) + { + if( to565( stmp[0], stmp[1], stmp[2] ) != ref ) + { + break; + } + stmp += 4; + } + if( stmp == src + 64 ) + { + return uint64_t( ref ) << 16; + } + + uint8_t min[3] = { src[0], src[1], src[2] }; + uint8_t max[3] = { src[0], src[1], src[2] }; + auto tmp = src + 4; + for( int i=1; i<16; i++ ) + { + for( int j=0; j<3; j++ ) + { + if( tmp[j] < min[j] ) min[j] = tmp[j]; + else if( tmp[j] > max[j] ) max[j] = tmp[j]; + } + tmp += 4; + } + + const uint32_t range = DivTable[max[0] - min[0] + max[1] - min[1] + max[2] - min[2]]; + const uint32_t rmin = min[0] + min[1] + min[2]; + for( int i=0; i<3; i++ ) + { + const uint8_t inset = ( max[i] - min[i] ) >> 4; + min[i] += inset; + max[i] -= inset; + } + + uint32_t data = 0; + for( int i=0; i<16; i++ ) + { + const uint32_t c = src[0] + src[1] + src[2] - rmin; + const uint8_t idx = ( c * range ) >> 16; + data |= idx << (i*2); + src += 4; + } + + return uint64_t( ( uint64_t( to565( min[0], min[1], min[2] ) ) << 16 ) | to565( max[0], max[1], max[2] ) | ( uint64_t( data ) << 32 ) ); +#endif +} + +#ifdef __AVX2__ +static tracy_force_inline void ProcessRGB_AVX( const uint8_t* src, char*& dst ) +{ + __m256i px0 = _mm256_loadu_si256(((__m256i*)src) + 0); + __m256i px1 = _mm256_loadu_si256(((__m256i*)src) + 1); + __m256i px2 = _mm256_loadu_si256(((__m256i*)src) + 2); + __m256i px3 = _mm256_loadu_si256(((__m256i*)src) + 3); + + __m256i min0 = _mm256_min_epu8( px0, px1 ); + __m256i min1 = _mm256_min_epu8( px2, px3 ); + __m256i min2 = _mm256_min_epu8( min0, min1 ); + + __m256i max0 = _mm256_max_epu8( px0, px1 ); + __m256i max1 = _mm256_max_epu8( px2, px3 ); + __m256i max2 = _mm256_max_epu8( max0, max1 ); + + __m256i min3 = _mm256_shuffle_epi32( min2, _MM_SHUFFLE( 2, 3, 0, 1 ) ); + __m256i max3 = _mm256_shuffle_epi32( max2, _MM_SHUFFLE( 2, 3, 0, 1 ) ); + __m256i min4 = _mm256_min_epu8( min2, min3 ); + __m256i max4 = _mm256_max_epu8( max2, max3 ); + + __m256i min5 = _mm256_shuffle_epi32( min4, _MM_SHUFFLE( 0, 0, 2, 2 ) ); + __m256i max5 = _mm256_shuffle_epi32( max4, _MM_SHUFFLE( 0, 0, 2, 2 ) ); + __m256i rmin = _mm256_min_epu8( min4, min5 ); + __m256i rmax = _mm256_max_epu8( max4, max5 ); + + __m256i range1 = _mm256_subs_epu8( rmax, rmin ); + __m256i range2 = _mm256_sad_epu8( rmax, rmin ); + + uint16_t vrange0 = DivTableAVX[_mm256_cvtsi256_si32( range2 ) >> 1]; + uint16_t vrange1 = DivTableAVX[_mm256_extract_epi16( range2, 8 ) >> 1]; + __m256i range00 = _mm256_set1_epi16( vrange0 ); + __m256i range = _mm256_inserti128_si256( range00, _mm_set1_epi16( vrange1 ), 1 ); + + __m256i inset1 = _mm256_srli_epi16( range1, 4 ); + __m256i inset = _mm256_and_si256( inset1, _mm256_set1_epi8( 0xF ) ); + __m256i min = _mm256_adds_epu8( rmin, inset ); + __m256i 
max = _mm256_subs_epu8( rmax, inset ); + + __m256i c0 = _mm256_subs_epu8( px0, rmin ); + __m256i c1 = _mm256_subs_epu8( px1, rmin ); + __m256i c2 = _mm256_subs_epu8( px2, rmin ); + __m256i c3 = _mm256_subs_epu8( px3, rmin ); + + __m256i is0 = _mm256_maddubs_epi16( c0, _mm256_set1_epi8( 1 ) ); + __m256i is1 = _mm256_maddubs_epi16( c1, _mm256_set1_epi8( 1 ) ); + __m256i is2 = _mm256_maddubs_epi16( c2, _mm256_set1_epi8( 1 ) ); + __m256i is3 = _mm256_maddubs_epi16( c3, _mm256_set1_epi8( 1 ) ); + + __m256i s0 = _mm256_hadd_epi16( is0, is1 ); + __m256i s1 = _mm256_hadd_epi16( is2, is3 ); + + __m256i m0 = _mm256_mulhi_epu16( s0, range ); + __m256i m1 = _mm256_mulhi_epu16( s1, range ); + + __m256i p0 = _mm256_packus_epi16( m0, m1 ); + + __m256i p1 = _mm256_or_si256( _mm256_srai_epi32( p0, 6 ), _mm256_srai_epi32( p0, 12 ) ); + __m256i p2 = _mm256_or_si256( _mm256_srai_epi32( p0, 18 ), p0 ); + __m256i p3 = _mm256_or_si256( p1, p2 ); + __m256i p =_mm256_shuffle_epi8( p3, _mm256_set1_epi32( 0x0C080400 ) ); + + __m256i mm0 = _mm256_unpacklo_epi8( _mm256_setzero_si256(), min ); + __m256i mm1 = _mm256_unpacklo_epi8( _mm256_setzero_si256(), max ); + __m256i mm2 = _mm256_unpacklo_epi64( mm1, mm0 ); + __m256i mmr = _mm256_slli_epi64( _mm256_srli_epi64( mm2, 11 ), 11 ); + __m256i mmg = _mm256_slli_epi64( _mm256_srli_epi64( mm2, 26 ), 5 ); + __m256i mmb = _mm256_srli_epi64( _mm256_slli_epi64( mm2, 16 ), 59 ); + __m256i mm3 = _mm256_or_si256( mmr, mmg ); + __m256i mm4 = _mm256_or_si256( mm3, mmb ); + __m256i mm5 = _mm256_shuffle_epi8( mm4, _mm256_set1_epi32( 0x09080100 ) ); + + __m256i d0 = _mm256_unpacklo_epi32( mm5, p ); + __m256i d1 = _mm256_permute4x64_epi64( d0, _MM_SHUFFLE( 3, 2, 2, 0 ) ); + _mm_storeu_si128( (__m128i*)dst, _mm256_castsi256_si128( d1 ) ); + dst += 16; +} +#endif + +void CompressImageDxt1( const char* src, char* dst, int w, int h ) +{ + assert( (w % 4) == 0 && (h % 4) == 0 ); + +#ifdef __AVX2__ + if( w%8 == 0 ) + { + uint32_t buf[8*4]; + int i = 0; + + auto blocks = w * h / 32; + do + { + auto tmp = (char*)buf; + memcpy( tmp, src, 8*4 ); + memcpy( tmp + 8*4, src + w * 4, 8*4 ); + memcpy( tmp + 16*4, src + w * 8, 8*4 ); + memcpy( tmp + 24*4, src + w * 12, 8*4 ); + src += 8*4; + if( ++i == w/8 ) + { + src += w * 3 * 4; + i = 0; + } + + ProcessRGB_AVX( (uint8_t*)buf, dst ); + } + while( --blocks ); + } + else +#endif + { + uint32_t buf[4*4]; + int i = 0; + + auto ptr = dst; + auto blocks = w * h / 16; + do + { + auto tmp = (char*)buf; + memcpy( tmp, src, 4*4 ); + memcpy( tmp + 4*4, src + w * 4, 4*4 ); + memcpy( tmp + 8*4, src + w * 8, 4*4 ); + memcpy( tmp + 12*4, src + w * 12, 4*4 ); + src += 4*4; + if( ++i == w/4 ) + { + src += w * 3 * 4; + i = 0; + } + + const auto c = ProcessRGB( (uint8_t*)buf ); + memcpy( ptr, &c, sizeof( uint64_t ) ); + ptr += sizeof( uint64_t ); + } + while( --blocks ); + } +} + +} diff --git a/libs/tracy/client/TracyDxt1.hpp b/libs/tracy/client/TracyDxt1.hpp @@ -0,0 +1,11 @@ +#ifndef __TRACYDXT1_HPP__ +#define __TRACYDXT1_HPP__ + +namespace tracy +{ + +void CompressImageDxt1( const char* src, char* dst, int w, int h ); + +} + +#endif diff --git a/libs/tracy/client/TracyFastVector.hpp b/libs/tracy/client/TracyFastVector.hpp @@ -0,0 +1,116 @@ +#ifndef __TRACYFASTVECTOR_HPP__ +#define __TRACYFASTVECTOR_HPP__ + +#include <stddef.h> + +#include "../common/TracyAlloc.hpp" +#include "../common/TracyForceInline.hpp" + +namespace tracy +{ + +template<typename T> +class FastVector +{ +public: + using iterator = T*; + using const_iterator = const T*; + + FastVector( size_t 
capacity ) + : m_ptr( (T*)tracy_malloc( sizeof( T ) * capacity ) ) + , m_write( m_ptr ) + , m_end( m_ptr + capacity ) + { + } + + FastVector( const FastVector& ) = delete; + FastVector( FastVector&& ) = delete; + + ~FastVector() + { + tracy_free( m_ptr ); + } + + FastVector& operator=( const FastVector& ) = delete; + FastVector& operator=( FastVector&& ) = delete; + + bool empty() const { return m_ptr == m_write; } + size_t size() const { return m_write - m_ptr; } + + T* data() { return m_ptr; } + const T* data() const { return m_ptr; }; + + T* begin() { return m_ptr; } + const T* begin() const { return m_ptr; } + T* end() { return m_write; } + const T* end() const { return m_write; } + + T& front() { assert( !empty() ); return m_ptr[0]; } + const T& front() const { assert( !empty() ); return m_ptr[0]; } + + T& back() { assert( !empty() ); return m_write[-1]; } + const T& back() const { assert( !empty() ); return m_write[-1]; } + + T& operator[]( size_t idx ) { return m_ptr[idx]; } + const T& operator[]( size_t idx ) const { return m_ptr[idx]; } + + T* push_next() + { + if( m_write == m_end ) AllocMore(); + return m_write++; + } + + T* prepare_next() + { + if( m_write == m_end ) AllocMore(); + return m_write; + } + + void commit_next() + { + m_write++; + } + + void clear() + { + m_write = m_ptr; + } + + void swap( FastVector& vec ) + { + const auto ptr1 = m_ptr; + const auto ptr2 = vec.m_ptr; + const auto write1 = m_write; + const auto write2 = vec.m_write; + const auto end1 = m_end; + const auto end2 = vec.m_end; + + m_ptr = ptr2; + vec.m_ptr = ptr1; + m_write = write2; + vec.m_write = write1; + m_end = end2; + vec.m_end = end1; + } + +private: + tracy_no_inline void AllocMore() + { + const auto cap = ( m_end - m_ptr ) * 2; + const auto size = m_write - m_ptr; + T* ptr = (T*)tracy_malloc( sizeof( T ) * cap ); + memcpy( ptr, m_ptr, size * sizeof( T ) ); + tracy_free( m_ptr ); + m_ptr = ptr; + m_write = m_ptr + size; + m_end = m_ptr + cap; + } + + T* m_ptr; + T* m_write; + T* m_end; +}; + +} + +#endif diff --git a/libs/tracy/client/TracyLock.hpp b/libs/tracy/client/TracyLock.hpp @@ -0,0 +1,527 @@ +#ifndef __TRACYLOCK_HPP__ +#define __TRACYLOCK_HPP__ + +#include <atomic> +#include <limits> + +#include "../common/TracySystem.hpp" +#include "../common/TracyAlign.hpp" +#include "TracyProfiler.hpp" + +namespace tracy +{ + +class LockableCtx +{ +public: + tracy_force_inline LockableCtx( const SourceLocationData* srcloc ) + : m_id( GetLockCounter().fetch_add( 1, std::memory_order_relaxed ) ) +#ifdef TRACY_ON_DEMAND + , m_lockCount( 0 ) + , m_active( false ) +#endif + { + assert( m_id != std::numeric_limits<uint32_t>::max() ); + + Magic magic; + auto token = GetToken(); + auto& tail = token->get_tail_index(); + auto item = token->enqueue_begin( magic ); + MemWrite( &item->hdr.type, QueueType::LockAnnounce ); + MemWrite( &item->lockAnnounce.id, m_id ); + MemWrite( &item->lockAnnounce.time, Profiler::GetTime() ); + MemWrite( &item->lockAnnounce.lckloc, (uint64_t)srcloc ); + MemWrite( &item->lockAnnounce.type, LockType::Lockable ); +#ifdef TRACY_ON_DEMAND + GetProfiler().DeferItem( *item ); +#endif + tail.store( magic + 1, std::memory_order_release ); + } + + LockableCtx( const LockableCtx& ) = delete; + LockableCtx& operator=( const LockableCtx& ) = delete; + + tracy_force_inline ~LockableCtx() + { + Magic magic; + auto token = GetToken(); + auto& tail = token->get_tail_index(); + auto item = token->enqueue_begin( magic ); + MemWrite( &item->hdr.type, QueueType::LockTerminate ); + MemWrite( 
&item->lockTerminate.id, m_id ); + MemWrite( &item->lockTerminate.time, Profiler::GetTime() ); + MemWrite( &item->lockTerminate.type, LockType::Lockable ); +#ifdef TRACY_ON_DEMAND + GetProfiler().DeferItem( *item ); +#endif + tail.store( magic + 1, std::memory_order_release ); + } + + tracy_force_inline bool BeforeLock() + { +#ifdef TRACY_ON_DEMAND + bool queue = false; + const auto locks = m_lockCount.fetch_add( 1, std::memory_order_relaxed ); + const auto active = m_active.load( std::memory_order_relaxed ); + if( locks == 0 || active ) + { + const bool connected = GetProfiler().IsConnected(); + if( active != connected ) m_active.store( connected, std::memory_order_relaxed ); + if( connected ) queue = true; + } + if( !queue ) return false; +#endif + + auto item = Profiler::QueueSerial(); + MemWrite( &item->hdr.type, QueueType::LockWait ); + MemWrite( &item->lockWait.thread, GetThreadHandle() ); + MemWrite( &item->lockWait.id, m_id ); + MemWrite( &item->lockWait.time, Profiler::GetTime() ); + MemWrite( &item->lockWait.type, LockType::Lockable ); + Profiler::QueueSerialFinish(); + return true; + } + + tracy_force_inline void AfterLock() + { + auto item = Profiler::QueueSerial(); + MemWrite( &item->hdr.type, QueueType::LockObtain ); + MemWrite( &item->lockObtain.thread, GetThreadHandle() ); + MemWrite( &item->lockObtain.id, m_id ); + MemWrite( &item->lockObtain.time, Profiler::GetTime() ); + Profiler::QueueSerialFinish(); + } + + tracy_force_inline void AfterUnlock() + { +#ifdef TRACY_ON_DEMAND + m_lockCount.fetch_sub( 1, std::memory_order_relaxed ); + if( !m_active.load( std::memory_order_relaxed ) ) return; + if( !GetProfiler().IsConnected() ) + { + m_active.store( false, std::memory_order_relaxed ); + return; + } +#endif + + auto item = Profiler::QueueSerial(); + MemWrite( &item->hdr.type, QueueType::LockRelease ); + MemWrite( &item->lockRelease.thread, GetThreadHandle() ); + MemWrite( &item->lockRelease.id, m_id ); + MemWrite( &item->lockRelease.time, Profiler::GetTime() ); + Profiler::QueueSerialFinish(); + } + + tracy_force_inline void AfterTryLock( bool acquired ) + { +#ifdef TRACY_ON_DEMAND + if( !acquired ) return; + + bool queue = false; + const auto locks = m_lockCount.fetch_add( 1, std::memory_order_relaxed ); + const auto active = m_active.load( std::memory_order_relaxed ); + if( locks == 0 || active ) + { + const bool connected = GetProfiler().IsConnected(); + if( active != connected ) m_active.store( connected, std::memory_order_relaxed ); + if( connected ) queue = true; + } + if( !queue ) return; +#endif + + if( acquired ) + { + auto item = Profiler::QueueSerial(); + MemWrite( &item->hdr.type, QueueType::LockObtain ); + MemWrite( &item->lockObtain.thread, GetThreadHandle() ); + MemWrite( &item->lockObtain.id, m_id ); + MemWrite( &item->lockObtain.time, Profiler::GetTime() ); + Profiler::QueueSerialFinish(); + } + } + + tracy_force_inline void Mark( const SourceLocationData* srcloc ) + { +#ifdef TRACY_ON_DEMAND + const auto active = m_active.load( std::memory_order_relaxed ); + if( !active ) return; + const auto connected = GetProfiler().IsConnected(); + if( !connected ) + { + if( active ) m_active.store( false, std::memory_order_relaxed ); + return; + } +#endif + + auto item = Profiler::QueueSerial(); + MemWrite( &item->hdr.type, QueueType::LockMark ); + MemWrite( &item->lockMark.thread, GetThreadHandle() ); + MemWrite( &item->lockMark.id, m_id ); + MemWrite( &item->lockMark.srcloc, (uint64_t)srcloc ); + Profiler::QueueSerialFinish(); + } + +private: + uint32_t m_id; + 
+#ifdef TRACY_ON_DEMAND + std::atomic<uint32_t> m_lockCount; + std::atomic<bool> m_active; +#endif +}; + +template<class T> +class Lockable +{ +public: + tracy_force_inline Lockable( const SourceLocationData* srcloc ) + : m_ctx( srcloc ) + { + } + + Lockable( const Lockable& ) = delete; + Lockable& operator=( const Lockable& ) = delete; + + tracy_force_inline void lock() + { + const auto runAfter = m_ctx.BeforeLock(); + m_lockable.lock(); + if( runAfter ) m_ctx.AfterLock(); + } + + tracy_force_inline void unlock() + { + m_lockable.unlock(); + m_ctx.AfterUnlock(); + } + + tracy_force_inline bool try_lock() + { + const auto acquired = m_lockable.try_lock(); + m_ctx.AfterTryLock( acquired ); + return acquired; + } + + tracy_force_inline void Mark( const SourceLocationData* srcloc ) + { + m_ctx.Mark( srcloc ); + } + +private: + T m_lockable; + LockableCtx m_ctx; +}; + + +class SharedLockableCtx +{ +public: + tracy_force_inline SharedLockableCtx( const SourceLocationData* srcloc ) + : m_id( GetLockCounter().fetch_add( 1, std::memory_order_relaxed ) ) +#ifdef TRACY_ON_DEMAND + , m_lockCount( 0 ) + , m_active( false ) +#endif + { + assert( m_id != std::numeric_limits<uint32_t>::max() ); + + Magic magic; + auto token = GetToken(); + auto& tail = token->get_tail_index(); + auto item = token->enqueue_begin( magic ); + MemWrite( &item->hdr.type, QueueType::LockAnnounce ); + MemWrite( &item->lockAnnounce.id, m_id ); + MemWrite( &item->lockAnnounce.time, Profiler::GetTime() ); + MemWrite( &item->lockAnnounce.lckloc, (uint64_t)srcloc ); + MemWrite( &item->lockAnnounce.type, LockType::SharedLockable ); + +#ifdef TRACY_ON_DEMAND + GetProfiler().DeferItem( *item ); +#endif + + tail.store( magic + 1, std::memory_order_release ); + } + + SharedLockableCtx( const SharedLockableCtx& ) = delete; + SharedLockableCtx& operator=( const SharedLockableCtx& ) = delete; + + tracy_force_inline ~SharedLockableCtx() + { + Magic magic; + auto token = GetToken(); + auto& tail = token->get_tail_index(); + auto item = token->enqueue_begin( magic ); + MemWrite( &item->hdr.type, QueueType::LockTerminate ); + MemWrite( &item->lockTerminate.id, m_id ); + MemWrite( &item->lockTerminate.time, Profiler::GetTime() ); + MemWrite( &item->lockTerminate.type, LockType::SharedLockable ); + +#ifdef TRACY_ON_DEMAND + GetProfiler().DeferItem( *item ); +#endif + + tail.store( magic + 1, std::memory_order_release ); + } + + tracy_force_inline bool BeforeLock() + { +#ifdef TRACY_ON_DEMAND + bool queue = false; + const auto locks = m_lockCount.fetch_add( 1, std::memory_order_relaxed ); + const auto active = m_active.load( std::memory_order_relaxed ); + if( locks == 0 || active ) + { + const bool connected = GetProfiler().IsConnected(); + if( active != connected ) m_active.store( connected, std::memory_order_relaxed ); + if( connected ) queue = true; + } + if( !queue ) return false; +#endif + + auto item = Profiler::QueueSerial(); + MemWrite( &item->hdr.type, QueueType::LockWait ); + MemWrite( &item->lockWait.thread, GetThreadHandle() ); + MemWrite( &item->lockWait.id, m_id ); + MemWrite( &item->lockWait.time, Profiler::GetTime() ); + MemWrite( &item->lockWait.type, LockType::SharedLockable ); + Profiler::QueueSerialFinish(); + return true; + } + + tracy_force_inline void AfterLock() + { + auto item = Profiler::QueueSerial(); + MemWrite( &item->hdr.type, QueueType::LockObtain ); + MemWrite( &item->lockObtain.thread, GetThreadHandle() ); + MemWrite( &item->lockObtain.id, m_id ); + MemWrite( &item->lockObtain.time, Profiler::GetTime() ); + 
Profiler::QueueSerialFinish(); + } + + tracy_force_inline void AfterUnlock() + { +#ifdef TRACY_ON_DEMAND + m_lockCount.fetch_sub( 1, std::memory_order_relaxed ); + if( !m_active.load( std::memory_order_relaxed ) ) return; + if( !GetProfiler().IsConnected() ) + { + m_active.store( false, std::memory_order_relaxed ); + return; + } +#endif + + auto item = Profiler::QueueSerial(); + MemWrite( &item->hdr.type, QueueType::LockRelease ); + MemWrite( &item->lockRelease.thread, GetThreadHandle() ); + MemWrite( &item->lockRelease.id, m_id ); + MemWrite( &item->lockRelease.time, Profiler::GetTime() ); + Profiler::QueueSerialFinish(); + } + + tracy_force_inline void AfterTryLock( bool acquired ) + { +#ifdef TRACY_ON_DEMAND + if( !acquired ) return; + + bool queue = false; + const auto locks = m_lockCount.fetch_add( 1, std::memory_order_relaxed ); + const auto active = m_active.load( std::memory_order_relaxed ); + if( locks == 0 || active ) + { + const bool connected = GetProfiler().IsConnected(); + if( active != connected ) m_active.store( connected, std::memory_order_relaxed ); + if( connected ) queue = true; + } + if( !queue ) return; +#endif + + if( acquired ) + { + auto item = Profiler::QueueSerial(); + MemWrite( &item->hdr.type, QueueType::LockObtain ); + MemWrite( &item->lockObtain.thread, GetThreadHandle() ); + MemWrite( &item->lockObtain.id, m_id ); + MemWrite( &item->lockObtain.time, Profiler::GetTime() ); + Profiler::QueueSerialFinish(); + } + } + + tracy_force_inline bool BeforeLockShared() + { +#ifdef TRACY_ON_DEMAND + bool queue = false; + const auto locks = m_lockCount.fetch_add( 1, std::memory_order_relaxed ); + const auto active = m_active.load( std::memory_order_relaxed ); + if( locks == 0 || active ) + { + const bool connected = GetProfiler().IsConnected(); + if( active != connected ) m_active.store( connected, std::memory_order_relaxed ); + if( connected ) queue = true; + } + if( !queue ) return false; +#endif + + auto item = Profiler::QueueSerial(); + MemWrite( &item->hdr.type, QueueType::LockSharedWait ); + MemWrite( &item->lockWait.thread, GetThreadHandle() ); + MemWrite( &item->lockWait.id, m_id ); + MemWrite( &item->lockWait.time, Profiler::GetTime() ); + MemWrite( &item->lockWait.type, LockType::SharedLockable ); + Profiler::QueueSerialFinish(); + return true; + } + + tracy_force_inline void AfterLockShared() + { + auto item = Profiler::QueueSerial(); + MemWrite( &item->hdr.type, QueueType::LockSharedObtain ); + MemWrite( &item->lockObtain.thread, GetThreadHandle() ); + MemWrite( &item->lockObtain.id, m_id ); + MemWrite( &item->lockObtain.time, Profiler::GetTime() ); + Profiler::QueueSerialFinish(); + } + + tracy_force_inline void AfterUnlockShared() + { +#ifdef TRACY_ON_DEMAND + m_lockCount.fetch_sub( 1, std::memory_order_relaxed ); + if( !m_active.load( std::memory_order_relaxed ) ) return; + if( !GetProfiler().IsConnected() ) + { + m_active.store( false, std::memory_order_relaxed ); + return; + } +#endif + + auto item = Profiler::QueueSerial(); + MemWrite( &item->hdr.type, QueueType::LockSharedRelease ); + MemWrite( &item->lockRelease.thread, GetThreadHandle() ); + MemWrite( &item->lockRelease.id, m_id ); + MemWrite( &item->lockRelease.time, Profiler::GetTime() ); + Profiler::QueueSerialFinish(); + } + + tracy_force_inline void AfterTryLockShared( bool acquired ) + { +#ifdef TRACY_ON_DEMAND + if( !acquired ) return; + + bool queue = false; + const auto locks = m_lockCount.fetch_add( 1, std::memory_order_relaxed ); + const auto active = m_active.load( 
std::memory_order_relaxed ); + if( locks == 0 || active ) + { + const bool connected = GetProfiler().IsConnected(); + if( active != connected ) m_active.store( connected, std::memory_order_relaxed ); + if( connected ) queue = true; + } + if( !queue ) return; +#endif + + if( acquired ) + { + auto item = Profiler::QueueSerial(); + MemWrite( &item->hdr.type, QueueType::LockSharedObtain ); + MemWrite( &item->lockObtain.thread, GetThreadHandle() ); + MemWrite( &item->lockObtain.id, m_id ); + MemWrite( &item->lockObtain.time, Profiler::GetTime() ); + Profiler::QueueSerialFinish(); + } + } + + tracy_force_inline void Mark( const SourceLocationData* srcloc ) + { +#ifdef TRACY_ON_DEMAND + const auto active = m_active.load( std::memory_order_relaxed ); + if( !active ) return; + const auto connected = GetProfiler().IsConnected(); + if( !connected ) + { + if( active ) m_active.store( false, std::memory_order_relaxed ); + return; + } +#endif + + auto item = Profiler::QueueSerial(); + MemWrite( &item->hdr.type, QueueType::LockMark ); + MemWrite( &item->lockMark.thread, GetThreadHandle() ); + MemWrite( &item->lockMark.id, m_id ); + MemWrite( &item->lockMark.srcloc, (uint64_t)srcloc ); + Profiler::QueueSerialFinish(); + } + +private: + uint32_t m_id; + +#ifdef TRACY_ON_DEMAND + std::atomic<uint32_t> m_lockCount; + std::atomic<bool> m_active; +#endif +}; + +template<class T> +class SharedLockable +{ +public: + tracy_force_inline SharedLockable( const SourceLocationData* srcloc ) + : m_ctx( srcloc ) + { + } + + SharedLockable( const SharedLockable& ) = delete; + SharedLockable& operator=( const SharedLockable& ) = delete; + + tracy_force_inline void lock() + { + const auto runAfter = m_ctx.BeforeLock(); + m_lockable.lock(); + if( runAfter ) m_ctx.AfterLock(); + } + + tracy_force_inline void unlock() + { + m_lockable.unlock(); + m_ctx.AfterUnlock(); + } + + tracy_force_inline bool try_lock() + { + const auto acquired = m_lockable.try_lock(); + m_ctx.AfterTryLock( acquired ); + return acquired; + } + + tracy_force_inline void lock_shared() + { + const auto runAfter = m_ctx.BeforeLockShared(); + m_lockable.lock_shared(); + if( runAfter ) m_ctx.AfterLockShared(); + } + + tracy_force_inline void unlock_shared() + { + m_lockable.unlock_shared(); + m_ctx.AfterUnlockShared(); + } + + tracy_force_inline bool try_lock_shared() + { + const auto acquired = m_lockable.try_lock_shared(); + m_ctx.AfterTryLockShared( acquired ); + return acquired; + } + + tracy_force_inline void Mark( const SourceLocationData* srcloc ) + { + m_ctx.Mark( srcloc ); + } + +private: + T m_lockable; + SharedLockableCtx m_ctx; +}; + + +}; + +#endif diff --git a/libs/tracy/client/TracyProfiler.cpp b/libs/tracy/client/TracyProfiler.cpp @@ -0,0 +1,2729 @@ +#ifdef TRACY_ENABLE + +#ifdef _WIN32 +# ifndef NOMINMAX +# define NOMINMAX +# endif +# include <winsock2.h> +# include <windows.h> +# include <tlhelp32.h> +# include <inttypes.h> +# include <intrin.h> +#else +# include <sys/time.h> +# include <sys/param.h> +#endif + +#ifdef __CYGWIN__ +# include <windows.h> +# include <unistd.h> +# include <tlhelp32.h> +#endif + +#ifdef _GNU_SOURCE +# include <errno.h> +#endif + +#ifdef __linux__ +# include <dirent.h> +# include <signal.h> +# include <pthread.h> +# include <sys/types.h> +# include <sys/syscall.h> +#endif + +#if defined __APPLE__ || defined BSD +# include <sys/types.h> +# include <sys/sysctl.h> +#endif + +#include <algorithm> +#include <assert.h> +#include <atomic> +#include <chrono> +#include <limits> +#include <new> +#include <stdlib.h> 
+#include <string.h> +#include <thread> + +#include "../common/TracyAlign.hpp" +#include "../common/TracyProtocol.hpp" +#include "../common/TracySocket.hpp" +#include "../common/TracySystem.hpp" +#include "../common/tracy_lz4.hpp" +#include "tracy_rpmalloc.hpp" +#include "TracyCallstack.hpp" +#include "TracyDxt1.hpp" +#include "TracyScoped.hpp" +#include "TracyProfiler.hpp" +#include "TracyThread.hpp" +#include "TracyArmCpuTable.hpp" +#include "TracySysTrace.hpp" +#include "../TracyC.h" + +#ifdef __APPLE__ +# define TRACY_DELAYED_INIT +#else +# ifdef __GNUC__ +# define init_order( val ) __attribute__ ((init_priority(val))) +# else +# define init_order(x) +# endif +#endif + +#if defined TRACY_HW_TIMER && __ARM_ARCH >= 6 && !defined TARGET_OS_IOS +# include <signal.h> +# include <setjmp.h> +#endif + +#if defined _WIN32 || defined __CYGWIN__ +# include <lmcons.h> +extern "C" typedef LONG (WINAPI *t_RtlGetVersion)( PRTL_OSVERSIONINFOW ); +#else +# include <unistd.h> +# include <limits.h> +#endif +#if defined __APPLE__ +# include "TargetConditionals.h" +#endif +#if defined __linux__ +# include <sys/sysinfo.h> +# include <sys/utsname.h> +#endif + +#if !defined _WIN32 && !defined __CYGWIN__ && ( defined __i386 || defined _M_IX86 || defined __x86_64__ || defined _M_X64 ) +# include <cpuid.h> +#endif + +#if !( ( ( defined _WIN32 || defined __CYGWIN__ ) && _WIN32_WINNT >= _WIN32_WINNT_VISTA ) || defined __linux__ ) +# include <mutex> +#endif + +namespace tracy +{ + +#ifndef TRACY_DELAYED_INIT +namespace +{ +# if ( defined _WIN32 || defined __CYGWIN__ ) && _WIN32_WINNT >= _WIN32_WINNT_VISTA + BOOL CALLBACK InitOnceCallback( PINIT_ONCE /*initOnce*/, PVOID /*Parameter*/, PVOID* /*Context*/) + { + rpmalloc_initialize(); + return TRUE; + } + INIT_ONCE InitOnce = INIT_ONCE_STATIC_INIT; +# elif defined __linux__ + void InitOnceCallback() + { + rpmalloc_initialize(); + } + pthread_once_t once_control = PTHREAD_ONCE_INIT; +# else + void InitOnceCallback() + { + rpmalloc_initialize(); + } + std::once_flag once_flag; +# endif +} + +struct RPMallocInit +{ + RPMallocInit() + { +# if ( defined _WIN32 || defined __CYGWIN__ ) && _WIN32_WINNT >= _WIN32_WINNT_VISTA + InitOnceExecuteOnce( &InitOnce, InitOnceCallback, nullptr, nullptr ); +# elif defined __linux__ + pthread_once( &once_control, InitOnceCallback ); +# else + std::call_once( once_flag, InitOnceCallback ); +# endif + rpmalloc_thread_initialize(); + } +}; + +struct InitTimeWrapper +{ + int64_t val; +}; + +struct ProducerWrapper +{ + tracy::moodycamel::ConcurrentQueue<QueueItem>::ExplicitProducer* ptr; +}; + +struct ThreadHandleWrapper +{ + uint64_t val; +}; +#endif + + +#if defined TRACY_HW_TIMER && ( defined __i386 || defined _M_IX86 || defined __x86_64__ || defined _M_X64 ) +static inline void CpuId( uint32_t* regs, uint32_t leaf ) +{ +#if defined _WIN32 || defined __CYGWIN__ + __cpuidex( (int*)regs, leaf, 0 ); +#else + __get_cpuid( leaf, regs, regs+1, regs+2, regs+3 ); +#endif +} + +static void InitFailure( const char* msg ) +{ +#if defined _WIN32 || defined __CYGWIN__ + bool hasConsole = false; + bool reopen = false; + const auto attached = AttachConsole( ATTACH_PARENT_PROCESS ); + if( attached ) + { + hasConsole = true; + reopen = true; + } + else + { + const auto err = GetLastError(); + if( err == ERROR_ACCESS_DENIED ) + { + hasConsole = true; + } + } + if( hasConsole ) + { + fprintf( stderr, "Tracy Profiler initialization failure: %s\n", msg ); + if( reopen ) + { + freopen( "CONOUT$", "w", stderr ); + fprintf( stderr, "Tracy Profiler initialization 
failure: %s\n", msg ); + } + } + else + { + MessageBoxA( nullptr, msg, "Tracy Profiler initialization failure", MB_ICONSTOP ); + } +#else + fprintf( stderr, "Tracy Profiler initialization failure: %s\n", msg ); +#endif + exit( 0 ); +} + +static int64_t SetupHwTimer() +{ + uint32_t regs[4]; + CpuId( regs, 0x80000001 ); + if( !( regs[3] & ( 1 << 27 ) ) ) InitFailure( "CPU doesn't support RDTSCP instruction." ); + CpuId( regs, 0x80000007 ); + if( !( regs[3] & ( 1 << 8 ) ) ) + { + const char* noCheck = getenv( "TRACY_NO_INVARIANT_CHECK" ); + if( !noCheck || noCheck[0] != '1' ) + { + InitFailure( "CPU doesn't support invariant TSC.\nDefine TRACY_NO_INVARIANT_CHECK=1 to ignore this error, *if you know what you are doing*." ); + } + } + + return Profiler::GetTime(); +} +#else +static int64_t SetupHwTimer() +{ + return Profiler::GetTime(); +} +#endif + +static const char* GetProcessName() +{ + const char* processName = "unknown"; +#ifdef _WIN32 + static char buf[_MAX_PATH]; + GetModuleFileNameA( nullptr, buf, _MAX_PATH ); + const char* ptr = buf; + while( *ptr != '\0' ) ptr++; + while( ptr > buf && *ptr != '\\' && *ptr != '/' ) ptr--; + if( ptr > buf ) ptr++; + processName = ptr; +#elif defined __ANDROID__ +# if __ANDROID_API__ >= 21 + auto buf = getprogname(); + if( buf ) processName = buf; +# endif +#elif defined _GNU_SOURCE || defined __CYGWIN__ + processName = program_invocation_short_name; +#elif defined __APPLE__ || defined BSD + auto buf = getprogname(); + if( buf ) processName = buf; +#endif + return processName; +} + +static uint32_t GetHex( char*& ptr, int skip ) +{ + uint32_t ret; + ptr += skip; + char* end; + if( ptr[0] == '0' && ptr[1] == 'x' ) + { + ptr += 2; + ret = strtol( ptr, &end, 16 ); + } + else + { + ret = strtol( ptr, &end, 10 ); + } + ptr = end; + return ret; +} + +static const char* GetHostInfo() +{ + static char buf[1024]; + auto ptr = buf; +#if defined _WIN32 || defined __CYGWIN__ +# ifdef UNICODE + t_RtlGetVersion RtlGetVersion = (t_RtlGetVersion)GetProcAddress( GetModuleHandle( L"ntdll.dll" ), "RtlGetVersion" ); +# else + t_RtlGetVersion RtlGetVersion = (t_RtlGetVersion)GetProcAddress( GetModuleHandle( "ntdll.dll" ), "RtlGetVersion" ); +# endif + + if( !RtlGetVersion ) + { +# ifdef __CYGWIN__ + ptr += sprintf( ptr, "OS: Windows (Cygwin)\n" ); +# elif defined __MINGW32__ + ptr += sprintf( ptr, "OS: Windows (MingW)\n" ); +# else + ptr += sprintf( ptr, "OS: Windows\n" ); +# endif + } + else + { + RTL_OSVERSIONINFOW ver = { sizeof( RTL_OSVERSIONINFOW ) }; + RtlGetVersion( &ver ); + +# ifdef __CYGWIN__ + ptr += sprintf( ptr, "OS: Windows %i.%i.%i (Cygwin)\n", ver.dwMajorVersion, ver.dwMinorVersion, ver.dwBuildNumber ); +# elif defined __MINGW32__ + ptr += sprintf( ptr, "OS: Windows %i.%i.%i (MingW)\n", (int)ver.dwMajorVersion, (int)ver.dwMinorVersion, (int)ver.dwBuildNumber ); +# else + ptr += sprintf( ptr, "OS: Windows %i.%i.%i\n", ver.dwMajorVersion, ver.dwMinorVersion, ver.dwBuildNumber ); +# endif + } +#elif defined __linux__ + struct utsname utsName; + uname( &utsName ); +# if defined __ANDROID__ + ptr += sprintf( ptr, "OS: Linux %s (Android)\n", utsName.release ); +# else + ptr += sprintf( ptr, "OS: Linux %s\n", utsName.release ); +# endif +#elif defined __APPLE__ +# if TARGET_OS_IPHONE == 1 + ptr += sprintf( ptr, "OS: Darwin (iOS)\n" ); +# elif TARGET_OS_MAC == 1 + ptr += sprintf( ptr, "OS: Darwin (OSX)\n" ); +# else + ptr += sprintf( ptr, "OS: Darwin (unknown)\n" ); +# endif +#elif defined __DragonFly__ + ptr += sprintf( ptr, "OS: BSD (DragonFly)\n" ); +#elif 
defined __FreeBSD__ + ptr += sprintf( ptr, "OS: BSD (FreeBSD)\n" ); +#elif defined __NetBSD__ + ptr += sprintf( ptr, "OS: BSD (NetBSD)\n" ); +#elif defined __OpenBSD__ + ptr += sprintf( ptr, "OS: BSD (OpenBSD)\n" ); +#else + ptr += sprintf( ptr, "OS: unknown\n" ); +#endif + +#if defined _MSC_VER +# if defined __clang__ + ptr += sprintf( ptr, "Compiler: MSVC clang-cl %i.%i.%i\n", __clang_major__, __clang_minor__, __clang_patchlevel__ ); +# else + ptr += sprintf( ptr, "Compiler: MSVC %i\n", _MSC_VER ); +# endif +#elif defined __clang__ + ptr += sprintf( ptr, "Compiler: clang %i.%i.%i\n", __clang_major__, __clang_minor__, __clang_patchlevel__ ); +#elif defined __GNUC__ + ptr += sprintf( ptr, "Compiler: gcc %i.%i\n", __GNUC__, __GNUC_MINOR__ ); +#else + ptr += sprintf( ptr, "Compiler: unknown\n" ); +#endif + +#if defined _WIN32 || defined __CYGWIN__ +# ifndef __CYGWIN__ + InitWinSock(); +# endif + char hostname[512]; + gethostname( hostname, 512 ); + + DWORD userSz = UNLEN+1; + char user[UNLEN+1]; + GetUserNameA( user, &userSz ); + + ptr += sprintf( ptr, "User: %s@%s\n", user, hostname ); +#else + char hostname[_POSIX_HOST_NAME_MAX]{}; + char user[_POSIX_LOGIN_NAME_MAX]{}; + + gethostname( hostname, _POSIX_HOST_NAME_MAX ); +# if defined __ANDROID__ + const auto login = getlogin(); + if( login ) + { + strcpy( user, login ); + } + else + { + memcpy( user, "(?)", 4 ); + } +# else + getlogin_r( user, _POSIX_LOGIN_NAME_MAX ); +# endif + + ptr += sprintf( ptr, "User: %s@%s\n", user, hostname ); +#endif + +#if defined __i386 || defined _M_IX86 + ptr += sprintf( ptr, "Arch: x86\n" ); +#elif defined __x86_64__ || defined _M_X64 + ptr += sprintf( ptr, "Arch: x64\n" ); +#elif defined __aarch64__ + ptr += sprintf( ptr, "Arch: ARM64\n" ); +#elif defined __ARM_ARCH + ptr += sprintf( ptr, "Arch: ARM\n" ); +#else + ptr += sprintf( ptr, "Arch: unknown\n" ); +#endif + +#if defined __i386 || defined _M_IX86 || defined __x86_64__ || defined _M_X64 + uint32_t regs[4]; + char cpuModel[4*4*3]; + auto modelPtr = cpuModel; + for( uint32_t i=0x80000002; i<0x80000005; ++i ) + { +# if defined _WIN32 || defined __CYGWIN__ + __cpuidex( (int*)regs, i, 0 ); +# else + int zero = 0; + asm volatile ( "cpuid" : "=a" (regs[0]), "=b" (regs[1]), "=c" (regs[2]), "=d" (regs[3]) : "a" (i), "c" (zero) ); +# endif + memcpy( modelPtr, regs, sizeof( regs ) ); modelPtr += sizeof( regs ); + } + + ptr += sprintf( ptr, "CPU: %s\n", cpuModel ); +#elif defined __linux__ && defined __ARM_ARCH + bool cpuFound = false; + FILE* fcpuinfo = fopen( "/proc/cpuinfo", "rb" ); + if( fcpuinfo ) + { + enum { BufSize = 4*1024 }; + char buf[BufSize]; + const auto sz = fread( buf, 1, BufSize, fcpuinfo ); + fclose( fcpuinfo ); + const auto end = buf + sz; + auto cptr = buf; + + uint32_t impl = 0; + uint32_t var = 0; + uint32_t part = 0; + uint32_t rev = 0; + + while( end - cptr > 20 ) + { + while( end - cptr > 20 && memcmp( cptr, "CPU ", 4 ) != 0 ) + { + cptr += 4; + while( end - cptr > 20 && *cptr != '\n' ) cptr++; + cptr++; + } + if( end - cptr <= 20 ) break; + cptr += 4; + if( memcmp( cptr, "implementer\t: ", 14 ) == 0 ) + { + if( impl != 0 ) break; + impl = GetHex( cptr, 14 ); + } + else if( memcmp( cptr, "variant\t: ", 10 ) == 0 ) var = GetHex( cptr, 10 ); + else if( memcmp( cptr, "part\t: ", 7 ) == 0 ) part = GetHex( cptr, 7 ); + else if( memcmp( cptr, "revision\t: ", 11 ) == 0 ) rev = GetHex( cptr, 11 ); + while( *cptr != '\n' && *cptr != '\0' ) cptr++; + cptr++; + } + + if( impl != 0 || var != 0 || part != 0 || rev != 0 ) + { + cpuFound = true; + ptr += 
sprintf( ptr, "CPU: %s%s r%ip%i\n", DecodeArmImplementer( impl ), DecodeArmPart( impl, part ), var, rev ); + } + } + if( !cpuFound ) + { + ptr += sprintf( ptr, "CPU: unknown\n" ); + } +#elif defined __APPLE__ && TARGET_OS_IPHONE == 1 + { + size_t sz; + sysctlbyname( "hw.machine", nullptr, &sz, nullptr, 0 ); + auto str = (char*)tracy_malloc( sz ); + sysctlbyname( "hw.machine", str, &sz, nullptr, 0 ); + ptr += sprintf( ptr, "Device: %s\n", DecodeIosDevice( str ) ); + tracy_free( str ); + } +#else + ptr += sprintf( ptr, "CPU: unknown\n" ); +#endif + + ptr += sprintf( ptr, "CPU cores: %i\n", std::thread::hardware_concurrency() ); + +#if defined _WIN32 || defined __CYGWIN__ + MEMORYSTATUSEX statex; + statex.dwLength = sizeof( statex ); + GlobalMemoryStatusEx( &statex ); +# ifdef _MSC_VER + ptr += sprintf( ptr, "RAM: %I64u MB\n", statex.ullTotalPhys / 1024 / 1024 ); +# else + ptr += sprintf( ptr, "RAM: %llu MB\n", statex.ullTotalPhys / 1024 / 1024 ); +# endif +#elif defined __linux__ + struct sysinfo sysInfo; + sysinfo( &sysInfo ); + ptr += sprintf( ptr, "RAM: %lu MB\n", sysInfo.totalram / 1024 / 1024 ); +#elif defined __APPLE__ + size_t memSize; + size_t sz = sizeof( memSize ); + sysctlbyname( "hw.memsize", &memSize, &sz, nullptr, 0 ); + ptr += sprintf( ptr, "RAM: %zu MB\n", memSize / 1024 / 1024 ); +#elif defined BSD + size_t memSize; + size_t sz = sizeof( memSize ); + sysctlbyname( "hw.physmem", &memSize, &sz, nullptr, 0 ); + ptr += sprintf( ptr, "RAM: %zu MB\n", memSize / 1024 / 1024 ); +#else + ptr += sprintf( ptr, "RAM: unknown\n" ); +#endif + + return buf; +} + +static uint64_t GetPid() +{ +#if defined _WIN32 || defined __CYGWIN__ + return uint64_t( GetCurrentProcessId() ); +#else + return uint64_t( getpid() ); +#endif +} + +static BroadcastMessage& GetBroadcastMessage( const char* procname, size_t pnsz, int& len ) +{ + static BroadcastMessage msg; + + msg.broadcastVersion = BroadcastVersion; + msg.protocolVersion = ProtocolVersion; + + memcpy( msg.programName, procname, pnsz ); + memset( msg.programName + pnsz, 0, WelcomeMessageProgramNameSize - pnsz ); + + len = int( offsetof( BroadcastMessage, programName ) + pnsz + 1 ); + return msg; +} + +#if defined _WIN32 || defined __CYGWIN__ +static DWORD s_profilerThreadId = 0; +static char s_crashText[1024]; + +LONG WINAPI CrashFilter( PEXCEPTION_POINTERS pExp ) +{ + const unsigned ec = pExp->ExceptionRecord->ExceptionCode; + auto msgPtr = s_crashText; + switch( ec ) + { + case EXCEPTION_ACCESS_VIOLATION: + msgPtr += sprintf( msgPtr, "Exception EXCEPTION_ACCESS_VIOLATION (0x%x). ", ec ); + switch( pExp->ExceptionRecord->ExceptionInformation[0] ) + { + case 0: + msgPtr += sprintf( msgPtr, "Read violation at address 0x%" PRIxPTR ".", pExp->ExceptionRecord->ExceptionInformation[1] ); + break; + case 1: + msgPtr += sprintf( msgPtr, "Write violation at address 0x%" PRIxPTR ".", pExp->ExceptionRecord->ExceptionInformation[1] ); + break; + case 8: + msgPtr += sprintf( msgPtr, "DEP violation at address 0x%" PRIxPTR ".", pExp->ExceptionRecord->ExceptionInformation[1] ); + break; + default: + break; + } + break; + case EXCEPTION_ARRAY_BOUNDS_EXCEEDED: + msgPtr += sprintf( msgPtr, "Exception EXCEPTION_ARRAY_BOUNDS_EXCEEDED (0x%x). ", ec ); + break; + case EXCEPTION_DATATYPE_MISALIGNMENT: + msgPtr += sprintf( msgPtr, "Exception EXCEPTION_DATATYPE_MISALIGNMENT (0x%x). ", ec ); + break; + case EXCEPTION_FLT_DIVIDE_BY_ZERO: + msgPtr += sprintf( msgPtr, "Exception EXCEPTION_FLT_DIVIDE_BY_ZERO (0x%x). 
", ec ); + break; + case EXCEPTION_ILLEGAL_INSTRUCTION: + msgPtr += sprintf( msgPtr, "Exception EXCEPTION_ILLEGAL_INSTRUCTION (0x%x). ", ec ); + break; + case EXCEPTION_IN_PAGE_ERROR: + msgPtr += sprintf( msgPtr, "Exception EXCEPTION_IN_PAGE_ERROR (0x%x). ", ec ); + break; + case EXCEPTION_INT_DIVIDE_BY_ZERO: + msgPtr += sprintf( msgPtr, "Exception EXCEPTION_INT_DIVIDE_BY_ZERO (0x%x). ", ec ); + break; + case EXCEPTION_PRIV_INSTRUCTION: + msgPtr += sprintf( msgPtr, "Exception EXCEPTION_PRIV_INSTRUCTION (0x%x). ", ec ); + break; + case EXCEPTION_STACK_OVERFLOW: + msgPtr += sprintf( msgPtr, "Exception EXCEPTION_STACK_OVERFLOW (0x%x). ", ec ); + break; + default: + return EXCEPTION_CONTINUE_SEARCH; + } + + { + Magic magic; + auto token = GetToken(); + auto& tail = token->get_tail_index(); + auto item = token->enqueue_begin( magic ); + MemWrite( &item->hdr.type, QueueType::CrashReport ); + item->crashReport.time = Profiler::GetTime(); + item->crashReport.text = (uint64_t)s_crashText; + tail.store( magic + 1, std::memory_order_release ); + + GetProfiler().SendCallstack( 60, "KiUserExceptionDispatcher" ); + } + + HANDLE h = CreateToolhelp32Snapshot( TH32CS_SNAPTHREAD, 0 ); + if( h == INVALID_HANDLE_VALUE ) return EXCEPTION_CONTINUE_SEARCH; + + THREADENTRY32 te = { sizeof( te ) }; + if( !Thread32First( h, &te ) ) + { + CloseHandle( h ); + return EXCEPTION_CONTINUE_SEARCH; + } + + const auto pid = GetCurrentProcessId(); + const auto tid = GetCurrentThreadId(); + + do + { + if( te.th32OwnerProcessID == pid && te.th32ThreadID != tid && te.th32ThreadID != s_profilerThreadId ) + { + HANDLE th = OpenThread( THREAD_SUSPEND_RESUME, FALSE, te.th32ThreadID ); + if( th != INVALID_HANDLE_VALUE ) + { + SuspendThread( th ); + CloseHandle( th ); + } + } + } + while( Thread32Next( h, &te ) ); + CloseHandle( h ); + + { + Magic magic; + auto token = GetToken(); + auto& tail = token->get_tail_index(); + auto item = token->enqueue_begin( magic ); + MemWrite( &item->hdr.type, QueueType::Crash ); + tail.store( magic + 1, std::memory_order_release ); + } + + std::this_thread::sleep_for( std::chrono::milliseconds( 500 ) ); + GetProfiler().RequestShutdown(); + while( !GetProfiler().HasShutdownFinished() ) { std::this_thread::sleep_for( std::chrono::milliseconds( 10 ) ); }; + + TerminateProcess( GetCurrentProcess(), 1 ); + + return EXCEPTION_CONTINUE_SEARCH; +} +#endif + +#ifdef __linux__ +static long s_profilerTid = 0; +static char s_crashText[1024]; +static std::atomic<bool> s_alreadyCrashed( false ); + +static void ThreadFreezer( int /*signal*/ ) +{ + for(;;) sleep( 1000 ); +} + +static inline void HexPrint( char*& ptr, uint64_t val ) +{ + if( val == 0 ) + { + *ptr++ = '0'; + return; + } + + static const char HexTable[16] = { '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'a', 'b', 'c', 'd', 'e', 'f' }; + char buf[16]; + auto bptr = buf; + + do + { + *bptr++ = HexTable[val%16]; + val /= 16; + } + while( val > 0 ); + + do + { + *ptr++ = *--bptr; + } + while( bptr != buf ); +} + +static void CrashHandler( int signal, siginfo_t* info, void* /*ucontext*/ ) +{ + bool expected = false; + if( !s_alreadyCrashed.compare_exchange_strong( expected, true ) ) ThreadFreezer( signal ); + + auto msgPtr = s_crashText; + switch( signal ) + { + case SIGILL: + strcpy( msgPtr, "Illegal Instruction.\n" ); + while( *msgPtr ) msgPtr++; + switch( info->si_code ) + { + case ILL_ILLOPC: + strcpy( msgPtr, "Illegal opcode.\n" ); + break; + case ILL_ILLOPN: + strcpy( msgPtr, "Illegal operand.\n" ); + break; + case ILL_ILLADR: + strcpy( msgPtr, 
"Illegal addressing mode.\n" ); + break; + case ILL_ILLTRP: + strcpy( msgPtr, "Illegal trap.\n" ); + break; + case ILL_PRVOPC: + strcpy( msgPtr, "Privileged opcode.\n" ); + break; + case ILL_PRVREG: + strcpy( msgPtr, "Privileged register.\n" ); + break; + case ILL_COPROC: + strcpy( msgPtr, "Coprocessor error.\n" ); + break; + case ILL_BADSTK: + strcpy( msgPtr, "Internal stack error.\n" ); + break; + default: + break; + } + break; + case SIGFPE: + strcpy( msgPtr, "Floating-point exception.\n" ); + while( *msgPtr ) msgPtr++; + switch( info->si_code ) + { + case FPE_INTDIV: + strcpy( msgPtr, "Integer divide by zero.\n" ); + break; + case FPE_INTOVF: + strcpy( msgPtr, "Integer overflow.\n" ); + break; + case FPE_FLTDIV: + strcpy( msgPtr, "Floating-point divide by zero.\n" ); + break; + case FPE_FLTOVF: + strcpy( msgPtr, "Floating-point overflow.\n" ); + break; + case FPE_FLTUND: + strcpy( msgPtr, "Floating-point underflow.\n" ); + break; + case FPE_FLTRES: + strcpy( msgPtr, "Floating-point inexact result.\n" ); + break; + case FPE_FLTINV: + strcpy( msgPtr, "Floating-point invalid operation.\n" ); + break; + case FPE_FLTSUB: + strcpy( msgPtr, "Subscript out of range.\n" ); + break; + default: + break; + } + break; + case SIGSEGV: + strcpy( msgPtr, "Invalid memory reference.\n" ); + while( *msgPtr ) msgPtr++; + switch( info->si_code ) + { + case SEGV_MAPERR: + strcpy( msgPtr, "Address not mapped to object.\n" ); + break; + case SEGV_ACCERR: + strcpy( msgPtr, "Invalid permissions for mapped object.\n" ); + break; +# ifdef SEGV_BNDERR + case SEGV_BNDERR: + strcpy( msgPtr, "Failed address bound checks.\n" ); + break; +# endif +# ifdef SEGV_PKUERR + case SEGV_PKUERR: + strcpy( msgPtr, "Access was denied by memory protection keys.\n" ); + break; +# endif + default: + break; + } + break; + case SIGPIPE: + strcpy( msgPtr, "Broken pipe.\n" ); + while( *msgPtr ) msgPtr++; + break; + case SIGBUS: + strcpy( msgPtr, "Bus error.\n" ); + while( *msgPtr ) msgPtr++; + switch( info->si_code ) + { + case BUS_ADRALN: + strcpy( msgPtr, "Invalid address alignment.\n" ); + break; + case BUS_ADRERR: + strcpy( msgPtr, "Nonexistent physical address.\n" ); + break; + case BUS_OBJERR: + strcpy( msgPtr, "Object-specific hardware error.\n" ); + break; + case BUS_MCEERR_AR: + strcpy( msgPtr, "Hardware memory error consumed on a machine check; action required.\n" ); + break; + case BUS_MCEERR_AO: + strcpy( msgPtr, "Hardware memory error detected in process but not consumed; action optional.\n" ); + break; + default: + break; + } + break; + default: + abort(); + } + while( *msgPtr ) msgPtr++; + + if( signal != SIGPIPE ) + { + strcpy( msgPtr, "Fault address: 0x" ); + while( *msgPtr ) msgPtr++; + HexPrint( msgPtr, uint64_t( info->si_addr ) ); + *msgPtr++ = '\n'; + } + + { + Magic magic; + auto token = GetToken(); + auto& tail = token->get_tail_index(); + auto item = token->enqueue_begin( magic ); + MemWrite( &item->hdr.type, QueueType::CrashReport ); + item->crashReport.time = Profiler::GetTime(); + item->crashReport.text = (uint64_t)s_crashText; + tail.store( magic + 1, std::memory_order_release ); + + GetProfiler().SendCallstack( 60, "__kernel_rt_sigreturn" ); + } + + DIR* dp = opendir( "/proc/self/task" ); + if( !dp ) abort(); + + const auto selfTid = syscall( SYS_gettid ); + + struct dirent* ep; + while( ( ep = readdir( dp ) ) != nullptr ) + { + if( ep->d_name[0] == '.' 
) continue; + int tid = atoi( ep->d_name ); + if( tid != selfTid && tid != s_profilerTid ) + { + syscall( SYS_tkill, tid, SIGPWR ); + } + } + closedir( dp ); + + { + Magic magic; + auto token = GetToken(); + auto& tail = token->get_tail_index(); + auto item = token->enqueue_begin( magic ); + MemWrite( &item->hdr.type, QueueType::Crash ); + tail.store( magic + 1, std::memory_order_release ); + } + + std::this_thread::sleep_for( std::chrono::milliseconds( 500 ) ); + GetProfiler().RequestShutdown(); + while( !GetProfiler().HasShutdownFinished() ) { std::this_thread::sleep_for( std::chrono::milliseconds( 10 ) ); }; + + abort(); +} +#endif + + +enum { QueuePrealloc = 256 * 1024 }; + +static Profiler* s_instance; +static Thread* s_thread; +static Thread* s_compressThread; + +#ifdef TRACY_HAS_SYSTEM_TRACING +static Thread* s_sysTraceThread = nullptr; +#endif + +#ifdef TRACY_DELAYED_INIT +struct ThreadNameData; +TRACY_API moodycamel::ConcurrentQueue<QueueItem>& GetQueue(); + +struct RPMallocInit { RPMallocInit() { rpmalloc_initialize(); } }; + +TRACY_API void InitRPMallocThread() +{ + rpmalloc_initialize(); + rpmalloc_thread_initialize(); +} + +struct ProfilerData +{ + int64_t initTime = SetupHwTimer(); + RPMallocInit rpmalloc_init; + moodycamel::ConcurrentQueue<QueueItem> queue; + Profiler profiler; + std::atomic<uint32_t> lockCounter { 0 }; + std::atomic<uint8_t> gpuCtxCounter { 0 }; + std::atomic<ThreadNameData*> threadNameData { nullptr }; +}; + +struct ProducerWrapper +{ + ProducerWrapper( ProfilerData& data ) : detail( data.queue ), ptr( data.queue.get_explicit_producer( detail ) ) {} + moodycamel::ProducerToken detail; + tracy::moodycamel::ConcurrentQueue<QueueItem>::ExplicitProducer* ptr; +}; + +struct ProfilerThreadData +{ + ProfilerThreadData( ProfilerData& data ) : token( data ), gpuCtx( { nullptr } ) {} + RPMallocInit rpmalloc_init; + ProducerWrapper token; + GpuCtxWrapper gpuCtx; +# ifdef TRACY_ON_DEMAND + LuaZoneState luaZoneState; +# endif +}; + +static std::atomic<int> profilerDataLock { 0 }; +static std::atomic<ProfilerData*> profilerData { nullptr }; + +static ProfilerData& GetProfilerData() +{ + auto ptr = profilerData.load( std::memory_order_acquire ); + if( !ptr ) + { + int expected = 0; + while( !profilerDataLock.compare_exchange_strong( expected, 1, std::memory_order_release, std::memory_order_relaxed ) ) { expected = 0; } + ptr = profilerData.load( std::memory_order_acquire ); + if( !ptr ) + { + ptr = (ProfilerData*)malloc( sizeof( ProfilerData ) ); + new (ptr) ProfilerData(); + profilerData.store( ptr, std::memory_order_release ); + } + profilerDataLock.store( 0, std::memory_order_release ); + } + return *ptr; +} + +static ProfilerThreadData& GetProfilerThreadData() +{ + thread_local ProfilerThreadData data( GetProfilerData() ); + return data; +} + +TRACY_API moodycamel::ConcurrentQueue<QueueItem>::ExplicitProducer* GetToken() { return GetProfilerThreadData().token.ptr; } +TRACY_API Profiler& GetProfiler() { return GetProfilerData().profiler; } +TRACY_API moodycamel::ConcurrentQueue<QueueItem>& GetQueue() { return GetProfilerData().queue; } +TRACY_API int64_t GetInitTime() { return GetProfilerData().initTime; } +TRACY_API std::atomic<uint32_t>& GetLockCounter() { return GetProfilerData().lockCounter; } +TRACY_API std::atomic<uint8_t>& GetGpuCtxCounter() { return GetProfilerData().gpuCtxCounter; } +TRACY_API GpuCtxWrapper& GetGpuCtx() { return GetProfilerThreadData().gpuCtx; } +TRACY_API uint64_t GetThreadHandle() { return detail::GetThreadHandleImpl(); } +TRACY_API 
std::atomic<ThreadNameData*>& GetThreadNameData() { return GetProfilerData().threadNameData; } + +# ifdef TRACY_ON_DEMAND +TRACY_API LuaZoneState& GetLuaZoneState() { return GetProfilerThreadData().luaZoneState; } +# endif + +#else +TRACY_API void InitRPMallocThread() +{ + rpmalloc_thread_initialize(); +} + +// MSVC static initialization order solution. gcc/clang uses init_order() to avoid all this. + +// 1a. But s_queue is needed for initialization of variables in point 2. +extern moodycamel::ConcurrentQueue<QueueItem> s_queue; + +thread_local RPMallocInit init_order(106) s_rpmalloc_thread_init; + +// 2. If these variables would be in the .CRT$XCB section, they would be initialized only in main thread. +thread_local moodycamel::ProducerToken init_order(107) s_token_detail( s_queue ); +thread_local ProducerWrapper init_order(108) s_token { s_queue.get_explicit_producer( s_token_detail ) }; +thread_local ThreadHandleWrapper init_order(104) s_threadHandle { detail::GetThreadHandleImpl() }; + +# ifdef _MSC_VER +// 1. Initialize these static variables before all other variables. +# pragma warning( disable : 4075 ) +# pragma init_seg( ".CRT$XCB" ) +# endif + +static InitTimeWrapper init_order(101) s_initTime { SetupHwTimer() }; +static RPMallocInit init_order(102) s_rpmalloc_init; +moodycamel::ConcurrentQueue<QueueItem> init_order(103) s_queue( QueuePrealloc ); +std::atomic<uint32_t> init_order(104) s_lockCounter( 0 ); +std::atomic<uint8_t> init_order(104) s_gpuCtxCounter( 0 ); + +thread_local GpuCtxWrapper init_order(104) s_gpuCtx { nullptr }; + +struct ThreadNameData; +static std::atomic<ThreadNameData*> init_order(104) s_threadNameDataInstance( nullptr ); +std::atomic<ThreadNameData*>& s_threadNameData = s_threadNameDataInstance; + +# ifdef TRACY_ON_DEMAND +thread_local LuaZoneState init_order(104) s_luaZoneState { 0, false }; +# endif + +static Profiler init_order(105) s_profiler; + +TRACY_API moodycamel::ConcurrentQueue<QueueItem>::ExplicitProducer* GetToken() { return s_token.ptr; } +TRACY_API Profiler& GetProfiler() { return s_profiler; } +TRACY_API moodycamel::ConcurrentQueue<QueueItem>& GetQueue() { return s_queue; } +TRACY_API int64_t GetInitTime() { return s_initTime.val; } +TRACY_API std::atomic<uint32_t>& GetLockCounter() { return s_lockCounter; } +TRACY_API std::atomic<uint8_t>& GetGpuCtxCounter() { return s_gpuCtxCounter; } +TRACY_API GpuCtxWrapper& GetGpuCtx() { return s_gpuCtx; } +# ifdef __CYGWIN__ +// Hackfix for cygwin reporting memory frees without matching allocations. WTF? 
+TRACY_API uint64_t GetThreadHandle() { return detail::GetThreadHandleImpl(); } +# else +TRACY_API uint64_t GetThreadHandle() { return s_threadHandle.val; } +# endif + +TRACY_API std::atomic<ThreadNameData*>& GetThreadNameData() { return s_threadNameData; } + +# ifdef TRACY_ON_DEMAND +TRACY_API LuaZoneState& GetLuaZoneState() { return s_luaZoneState; } +# endif +#endif + +enum { BulkSize = TargetFrameSize / QueueItemSize }; + +Profiler::Profiler() + : m_timeBegin( 0 ) + , m_mainThread( detail::GetThreadHandleImpl() ) + , m_epoch( std::chrono::duration_cast<std::chrono::seconds>( std::chrono::system_clock::now().time_since_epoch() ).count() ) + , m_shutdown( false ) + , m_shutdownManual( false ) + , m_shutdownFinished( false ) + , m_sock( nullptr ) + , m_broadcast( nullptr ) + , m_noExit( false ) + , m_zoneId( 1 ) + , m_stream( LZ4_createStream() ) + , m_buffer( (char*)tracy_malloc( TargetFrameSize*3 ) ) + , m_bufferOffset( 0 ) + , m_bufferStart( 0 ) + , m_itemBuf( (QueueItem*)tracy_malloc( sizeof( QueueItem ) * BulkSize ) ) + , m_lz4Buf( (char*)tracy_malloc( LZ4Size + sizeof( lz4sz_t ) ) ) + , m_serialQueue( 1024*1024 ) + , m_serialDequeue( 1024*1024 ) + , m_fiQueue( 16 ) + , m_fiDequeue( 16 ) + , m_frameCount( 0 ) +#ifdef TRACY_ON_DEMAND + , m_isConnected( false ) + , m_connectionId( 0 ) + , m_deferredQueue( 64*1024 ) +#endif + , m_paramCallback( nullptr ) +{ + assert( !s_instance ); + s_instance = this; + +#ifndef TRACY_DELAYED_INIT +# ifdef _MSC_VER + // 3. But these variables need to be initialized in main thread within the .CRT$XCB section. Do it here. + s_token_detail = moodycamel::ProducerToken( s_queue ); + s_token = ProducerWrapper { s_queue.get_explicit_producer( s_token_detail ) }; + s_threadHandle = ThreadHandleWrapper { m_mainThread }; +# endif +#endif + + CalibrateTimer(); + CalibrateDelay(); + +#ifndef TRACY_NO_EXIT + const char* noExitEnv = getenv( "TRACY_NO_EXIT" ); + if( noExitEnv && noExitEnv[0] == '1' ) + { + m_noExit = true; + } +#endif + + s_thread = (Thread*)tracy_malloc( sizeof( Thread ) ); + new(s_thread) Thread( LaunchWorker, this ); + + s_compressThread = (Thread*)tracy_malloc( sizeof( Thread ) ); + new(s_compressThread) Thread( LaunchCompressWorker, this ); + +#ifdef TRACY_HAS_SYSTEM_TRACING + if( SysTraceStart() ) + { + s_sysTraceThread = (Thread*)tracy_malloc( sizeof( Thread ) ); + new(s_sysTraceThread) Thread( SysTraceWorker, nullptr ); + } +#endif + +#if defined _WIN32 || defined __CYGWIN__ + s_profilerThreadId = GetThreadId( s_thread->Handle() ); + AddVectoredExceptionHandler( 1, CrashFilter ); +#endif + +#ifdef __linux__ + struct sigaction threadFreezer = {}; + threadFreezer.sa_handler = ThreadFreezer; + sigaction( SIGPWR, &threadFreezer, nullptr ); + + struct sigaction crashHandler = {}; + crashHandler.sa_sigaction = CrashHandler; + crashHandler.sa_flags = SA_SIGINFO; + sigaction( SIGILL, &crashHandler, nullptr ); + sigaction( SIGFPE, &crashHandler, nullptr ); + sigaction( SIGSEGV, &crashHandler, nullptr ); + sigaction( SIGPIPE, &crashHandler, nullptr ); + sigaction( SIGBUS, &crashHandler, nullptr ); +#endif + +#ifdef TRACY_HAS_CALLSTACK + InitCallstack(); +#endif + + m_timeBegin.store( GetTime(), std::memory_order_relaxed ); +} + +Profiler::~Profiler() +{ + m_shutdown.store( true, std::memory_order_relaxed ); + +#ifdef TRACY_HAS_SYSTEM_TRACING + if( s_sysTraceThread ) + { + SysTraceStop(); + s_sysTraceThread->~Thread(); + tracy_free( s_sysTraceThread ); + } +#endif + + s_compressThread->~Thread(); + tracy_free( s_compressThread ); + s_thread->~Thread(); 
+ tracy_free( s_thread ); + + tracy_free( m_lz4Buf ); + tracy_free( m_itemBuf ); + tracy_free( m_buffer ); + LZ4_freeStream( (LZ4_stream_t*)m_stream ); + + if( m_sock ) + { + m_sock->~Socket(); + tracy_free( m_sock ); + } + + if( m_broadcast ) + { + m_broadcast->~UdpBroadcast(); + tracy_free( m_broadcast ); + } + + assert( s_instance ); + s_instance = nullptr; +} + +bool Profiler::ShouldExit() +{ + return s_instance->m_shutdown.load( std::memory_order_relaxed ); +} + +void Profiler::Worker() +{ +#ifdef __linux__ + s_profilerTid = syscall( SYS_gettid ); +#endif + + SetThreadName( "Tracy Profiler" ); + +#ifdef TRACY_PORT + const auto port = TRACY_PORT; +#else + const auto port = 8086; +#endif + + while( m_timeBegin.load( std::memory_order_relaxed ) == 0 ) std::this_thread::sleep_for( std::chrono::milliseconds( 10 ) ); + + rpmalloc_thread_initialize(); + + const auto procname = GetProcessName(); + const auto pnsz = std::min<size_t>( strlen( procname ), WelcomeMessageProgramNameSize - 1 ); + + const auto hostinfo = GetHostInfo(); + const auto hisz = std::min<size_t>( strlen( hostinfo ), WelcomeMessageHostInfoSize - 1 ); + + const uint64_t pid = GetPid(); + +#ifdef TRACY_ON_DEMAND + uint8_t onDemand = 1; +#else + uint8_t onDemand = 0; +#endif + +#ifdef __APPLE__ + uint8_t isApple = 1; +#else + uint8_t isApple = 0; +#endif + + WelcomeMessage welcome; + MemWrite( &welcome.timerMul, m_timerMul ); + MemWrite( &welcome.initBegin, GetInitTime() ); + MemWrite( &welcome.initEnd, m_timeBegin.load( std::memory_order_relaxed ) ); + MemWrite( &welcome.delay, m_delay ); + MemWrite( &welcome.resolution, m_resolution ); + MemWrite( &welcome.epoch, m_epoch ); + MemWrite( &welcome.pid, pid ); + MemWrite( &welcome.onDemand, onDemand ); + MemWrite( &welcome.isApple, isApple ); + memcpy( welcome.programName, procname, pnsz ); + memset( welcome.programName + pnsz, 0, WelcomeMessageProgramNameSize - pnsz ); + memcpy( welcome.hostInfo, hostinfo, hisz ); + memset( welcome.hostInfo + hisz, 0, WelcomeMessageHostInfoSize - hisz ); + + moodycamel::ConsumerToken token( GetQueue() ); + + ListenSocket listen; + if( !listen.Listen( port, 8 ) ) + { + for(;;) + { + if( ShouldExit() ) + { + m_shutdownFinished.store( true, std::memory_order_relaxed ); + return; + } + + ClearQueues( token ); + } + } + +#ifndef TRACY_NO_BROADCAST + m_broadcast = (UdpBroadcast*)tracy_malloc( sizeof( UdpBroadcast ) ); + new(m_broadcast) UdpBroadcast(); + if( !m_broadcast->Open( "255.255.255.255", port ) ) + { + m_broadcast->~UdpBroadcast(); + tracy_free( m_broadcast ); + m_broadcast = nullptr; + } +#endif + + int broadcastLen = 0; + auto& broadcastMsg = GetBroadcastMessage( procname, pnsz, broadcastLen ); + uint64_t lastBroadcast = 0; + + // Connections loop. + // Each iteration of the loop handles whole connection. Multiple iterations will only + // happen in the on-demand mode or when handshake fails. 
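+    // Rough outline of the protocol this loop implements, as read from the code
+    // below (descriptive annotation, not upstream text): accept a TCP connection
+    // on `port` (TRACY_PORT, or 8086 by default), broadcasting a BroadcastMessage
+    // over UDP roughly every 3 seconds while waiting; verify the
+    // HandshakeShibboleth bytes and the server's ProtocolVersion, answering
+    // HandshakeProtocolMismatch and dropping the socket on mismatch; otherwise
+    // answer HandshakeWelcome, reset the LZ4 stream, send the WelcomeMessage,
+    // and stream LZ4-compressed queue data until the connection is lost or the
+    // client shuts down.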
+ for(;;) + { + // Wait for incoming connection + for(;;) + { +#ifndef TRACY_NO_EXIT + if( !m_noExit && ShouldExit() ) + { + m_shutdownFinished.store( true, std::memory_order_relaxed ); + return; + } +#endif + m_sock = listen.Accept(); + if( m_sock ) break; +#ifndef TRACY_ON_DEMAND + ProcessSysTime(); +#endif + + if( m_broadcast ) + { + const auto t = std::chrono::high_resolution_clock::now().time_since_epoch().count(); + if( t - lastBroadcast > 3000000000 ) // 3s + { + lastBroadcast = t; + const auto ts = std::chrono::duration_cast<std::chrono::seconds>( std::chrono::system_clock::now().time_since_epoch() ).count(); + broadcastMsg.activeTime = uint32_t( ts - m_epoch ); + m_broadcast->Send( port, &broadcastMsg, broadcastLen ); + } + } + } + + // Handshake + { + char shibboleth[HandshakeShibbolethSize]; + auto res = m_sock->ReadRaw( shibboleth, HandshakeShibbolethSize, 2000 ); + if( !res || memcmp( shibboleth, HandshakeShibboleth, HandshakeShibbolethSize ) != 0 ) + { + m_sock->~Socket(); + tracy_free( m_sock ); + m_sock = nullptr; + continue; + } + + uint32_t protocolVersion; + res = m_sock->ReadRaw( &protocolVersion, sizeof( protocolVersion ), 2000 ); + if( !res ) + { + m_sock->~Socket(); + tracy_free( m_sock ); + m_sock = nullptr; + continue; + } + + if( protocolVersion != ProtocolVersion ) + { + HandshakeStatus status = HandshakeProtocolMismatch; + m_sock->Send( &status, sizeof( status ) ); + m_sock->~Socket(); + tracy_free( m_sock ); + m_sock = nullptr; + continue; + } + } + +#ifdef TRACY_ON_DEMAND + const auto currentTime = GetTime(); + ClearQueues( token ); + m_connectionId.fetch_add( 1, std::memory_order_release ); + m_isConnected.store( true, std::memory_order_release ); +#endif + + HandshakeStatus handshake = HandshakeWelcome; + m_sock->Send( &handshake, sizeof( handshake ) ); + + LZ4_resetStream( (LZ4_stream_t*)m_stream ); + m_sock->Send( &welcome, sizeof( welcome ) ); + + m_threadCtx = 0; + m_refTimeSerial = 0; + m_refTimeCtx = 0; + m_refTimeGpu = 0; + +#ifdef TRACY_ON_DEMAND + OnDemandPayloadMessage onDemand; + onDemand.frames = m_frameCount.load( std::memory_order_relaxed ); + onDemand.currentTime = currentTime; + + m_sock->Send( &onDemand, sizeof( onDemand ) ); + + m_deferredLock.lock(); + for( auto& item : m_deferredQueue ) + { + const auto idx = MemRead<uint8_t>( &item.hdr.idx ); + if( (QueueType)idx == QueueType::MessageAppInfo ) + { + uint64_t ptr = MemRead<uint64_t>( &item.message.text ); + SendString( ptr, (const char*)ptr, QueueType::CustomStringData ); + } + AppendData( &item, QueueDataSize[idx] ); + } + m_deferredLock.unlock(); +#endif + + // Main communications loop + int keepAlive = 0; + for(;;) + { + ProcessSysTime(); + const auto status = Dequeue( token ); + const auto serialStatus = DequeueSerial(); + if( status == DequeueStatus::ConnectionLost || serialStatus == DequeueStatus::ConnectionLost ) + { + break; + } + else if( status == DequeueStatus::QueueEmpty && serialStatus == DequeueStatus::QueueEmpty ) + { + if( ShouldExit() ) break; + if( m_bufferOffset != m_bufferStart ) + { + if( !CommitData() ) break; + } + if( keepAlive == 500 ) + { + QueueItem ka; + ka.hdr.type = QueueType::KeepAlive; + AppendData( &ka, QueueDataSize[ka.hdr.idx] ); + if( !CommitData() ) break; + + keepAlive = 0; + } + else + { + keepAlive++; + std::this_thread::sleep_for( std::chrono::milliseconds( 10 ) ); + } + } + else + { + keepAlive = 0; + } + + bool connActive = true; + while( m_sock->HasData() && connActive ) + { + connActive = HandleServerQuery(); + } + if( !connActive ) break; + } 
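+        // Note on the idle path of the communications loop above (descriptive
+        // annotation): when both Dequeue() and DequeueSerial() report empty
+        // queues, the worker sleeps 10 ms per iteration and emits a KeepAlive
+        // item once the counter reaches 500, i.e. roughly every 5 seconds of
+        // inactivity, presumably so the server side can tell an idle client
+        // from a dead one.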
+ if( ShouldExit() ) break; + +#ifdef TRACY_ON_DEMAND + m_isConnected.store( false, std::memory_order_release ); + m_bufferOffset = 0; + m_bufferStart = 0; +#endif + + m_sock->~Socket(); + tracy_free( m_sock ); + m_sock = nullptr; + +#ifndef TRACY_ON_DEMAND + // Client is no longer available here. Accept incoming connections, but reject handshake. + for(;;) + { + if( ShouldExit() ) + { + m_shutdownFinished.store( true, std::memory_order_relaxed ); + return; + } + + ClearQueues( token ); + + m_sock = listen.Accept(); + if( m_sock ) + { + char shibboleth[HandshakeShibbolethSize]; + auto res = m_sock->ReadRaw( shibboleth, HandshakeShibbolethSize, 1000 ); + if( !res || memcmp( shibboleth, HandshakeShibboleth, HandshakeShibbolethSize ) != 0 ) + { + m_sock->~Socket(); + tracy_free( m_sock ); + m_sock = nullptr; + continue; + } + + uint32_t protocolVersion; + res = m_sock->ReadRaw( &protocolVersion, sizeof( protocolVersion ), 1000 ); + if( !res ) + { + m_sock->~Socket(); + tracy_free( m_sock ); + m_sock = nullptr; + continue; + } + + HandshakeStatus status = HandshakeNotAvailable; + m_sock->Send( &status, sizeof( status ) ); + m_sock->~Socket(); + tracy_free( m_sock ); + } + } +#endif + } + // End of connections loop + + // Client is exiting. Send items remaining in queues. + for(;;) + { + const auto status = Dequeue( token ); + const auto serialStatus = DequeueSerial(); + if( status == DequeueStatus::ConnectionLost || serialStatus == DequeueStatus::ConnectionLost ) + { + m_shutdownFinished.store( true, std::memory_order_relaxed ); + return; + } + else if( status == DequeueStatus::QueueEmpty && serialStatus == DequeueStatus::QueueEmpty ) + { + if( m_bufferOffset != m_bufferStart ) CommitData(); + break; + } + + while( m_sock->HasData() ) + { + if( !HandleServerQuery() ) + { + m_shutdownFinished.store( true, std::memory_order_relaxed ); + return; + } + } + } + + // Send client termination notice to the server + QueueItem terminate; + MemWrite( &terminate.hdr.type, QueueType::Terminate ); + if( !SendData( (const char*)&terminate, 1 ) ) + { + m_shutdownFinished.store( true, std::memory_order_relaxed ); + return; + } + // Handle remaining server queries + { // XXX diesel changes + if( m_bufferOffset != m_bufferStart ) CommitData(); + std::this_thread::sleep_for( std::chrono::milliseconds( 10 ) ); + } + for(;;) + { + if( m_sock->HasData() ) + { + while( m_sock->HasData() ) + { + if( !HandleServerQuery() ) + { + m_shutdownFinished.store( true, std::memory_order_relaxed ); + return; + } + } + while( Dequeue( token ) == DequeueStatus::DataDequeued ) {} + while( DequeueSerial() == DequeueStatus::DataDequeued ) {} + if( m_bufferOffset != m_bufferStart ) + { + if( !CommitData() ) + { + m_shutdownFinished.store( true, std::memory_order_relaxed ); + return; + } + } + } + else + { + // XXX diesel changes + // if( m_bufferOffset != m_bufferStart ) CommitData(); + // std::this_thread::sleep_for( std::chrono::milliseconds( 10 ) ); + m_shutdownFinished.store( true, std::memory_order_relaxed ); + return; + } + } +} + +void Profiler::CompressWorker() +{ + SetThreadName( "Tracy DXT1" ); + while( m_timeBegin.load( std::memory_order_relaxed ) == 0 ) std::this_thread::sleep_for( std::chrono::milliseconds( 10 ) ); + rpmalloc_thread_initialize(); + for(;;) + { + const auto shouldExit = ShouldExit(); + + { + bool lockHeld = true; + while( !m_fiLock.try_lock() ) + { + if( m_shutdownManual.load( std::memory_order_relaxed ) ) + { + lockHeld = false; + break; + } + } + if( !m_fiQueue.empty() ) m_fiQueue.swap( m_fiDequeue ); + 
if( lockHeld ) + { + m_fiLock.unlock(); + } + } + + const auto sz = m_fiDequeue.size(); + if( sz > 0 ) + { + auto fi = m_fiDequeue.data(); + auto end = fi + sz; + while( fi != end ) + { + const auto w = fi->w; + const auto h = fi->h; + const auto csz = size_t( w * h / 2 ); + auto etc1buf = (char*)tracy_malloc( csz ); + CompressImageDxt1( (const char*)fi->image, etc1buf, w, h ); + tracy_free( fi->image ); + + Magic magic; + auto token = GetToken(); + auto& tail = token->get_tail_index(); + auto item = token->enqueue_begin( magic ); + MemWrite( &item->hdr.type, QueueType::FrameImage ); + MemWrite( &item->frameImage.image, (uint64_t)etc1buf ); + MemWrite( &item->frameImage.frame, fi->frame ); + MemWrite( &item->frameImage.w, w ); + MemWrite( &item->frameImage.h, h ); + uint8_t flip = fi->flip; + MemWrite( &item->frameImage.flip, flip ); + tail.store( magic + 1, std::memory_order_release ); + + fi++; + } + m_fiDequeue.clear(); + } + else + { + std::this_thread::sleep_for( std::chrono::milliseconds( 20 ) ); + } + + if( shouldExit ) + { + return; + } + } +} + +static void FreeAssociatedMemory( const QueueItem& item ) +{ + if( item.hdr.idx >= (int)QueueType::Terminate ) return; + + uint64_t ptr; + switch( item.hdr.type ) + { + case QueueType::ZoneText: + case QueueType::ZoneName: + ptr = MemRead<uint64_t>( &item.zoneText.text ); + tracy_free( (void*)ptr ); + break; + case QueueType::Message: + case QueueType::MessageColor: + case QueueType::MessageCallstack: + case QueueType::MessageColorCallstack: +#ifndef TRACY_ON_DEMAND + case QueueType::MessageAppInfo: +#endif + ptr = MemRead<uint64_t>( &item.message.text ); + tracy_free( (void*)ptr ); + break; + case QueueType::ZoneBeginAllocSrcLoc: + case QueueType::ZoneBeginAllocSrcLocCallstack: + ptr = MemRead<uint64_t>( &item.zoneBegin.srcloc ); + tracy_free( (void*)ptr ); + break; + case QueueType::CallstackMemory: + ptr = MemRead<uint64_t>( &item.callstackMemory.ptr ); + tracy_free( (void*)ptr ); + break; + case QueueType::Callstack: + ptr = MemRead<uint64_t>( &item.callstack.ptr ); + tracy_free( (void*)ptr ); + break; + case QueueType::CallstackAlloc: + ptr = MemRead<uint64_t>( &item.callstackAlloc.nativePtr ); + tracy_free( (void*)ptr ); + ptr = MemRead<uint64_t>( &item.callstackAlloc.ptr ); + tracy_free( (void*)ptr ); + break; + case QueueType::FrameImage: + ptr = MemRead<uint64_t>( &item.frameImage.image ); + tracy_free( (void*)ptr ); + break; +#ifdef TRACY_ON_DEMAND + case QueueType::MessageAppInfo: + // Don't free memory associated with deferred messages. 
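+        // (the text pointer is still referenced from m_deferredQueue, which
+        // re-sends MessageAppInfo entries to every new connection, so it must
+        // stay alive)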
+ break; +#endif + default: + break; + } +} + +void Profiler::ClearQueues( moodycamel::ConsumerToken& token ) +{ + for(;;) + { + const auto sz = GetQueue().try_dequeue_bulk( token, m_itemBuf, BulkSize ); + if( sz == 0 ) break; + for( size_t i=0; i<sz; i++ ) FreeAssociatedMemory( m_itemBuf[i] ); + } + + ClearSerial(); +} + +void Profiler::ClearSerial() +{ + bool lockHeld = true; + while( !m_serialLock.try_lock() ) + { + if( m_shutdownManual.load( std::memory_order_relaxed ) ) + { + lockHeld = false; + break; + } + } + for( auto& v : m_serialQueue ) FreeAssociatedMemory( v ); + m_serialQueue.clear(); + if( lockHeld ) + { + m_serialLock.unlock(); + } + + for( auto& v : m_serialDequeue ) FreeAssociatedMemory( v ); + m_serialDequeue.clear(); +} + +Profiler::DequeueStatus Profiler::Dequeue( moodycamel::ConsumerToken& token ) +{ + uint64_t threadId; + const auto sz = GetQueue().try_dequeue_bulk_single( token, m_itemBuf, BulkSize, threadId ); + if( sz > 0 ) + { + if( threadId != m_threadCtx ) + { + QueueItem item; + MemWrite( &item.hdr.type, QueueType::ThreadContext ); + MemWrite( &item.threadCtx.thread, threadId ); + if( !AppendData( &item, QueueDataSize[(int)QueueType::ThreadContext] ) ) return DequeueStatus::ConnectionLost; + m_threadCtx = threadId; + m_refTimeThread = 0; + } + + auto end = m_itemBuf + sz; + auto item = m_itemBuf; + while( item != end ) + { + uint64_t ptr; + const auto idx = MemRead<uint8_t>( &item->hdr.idx ); + if( idx < (int)QueueType::Terminate ) + { + switch( (QueueType)idx ) + { + case QueueType::ZoneText: + case QueueType::ZoneName: + ptr = MemRead<uint64_t>( &item->zoneText.text ); + SendString( ptr, (const char*)ptr, QueueType::CustomStringData ); + tracy_free( (void*)ptr ); + break; + case QueueType::Message: + case QueueType::MessageColor: + case QueueType::MessageCallstack: + case QueueType::MessageColorCallstack: + ptr = MemRead<uint64_t>( &item->message.text ); + SendString( ptr, (const char*)ptr, QueueType::CustomStringData ); + tracy_free( (void*)ptr ); + break; + case QueueType::MessageAppInfo: + ptr = MemRead<uint64_t>( &item->message.text ); + SendString( ptr, (const char*)ptr, QueueType::CustomStringData ); +#ifndef TRACY_ON_DEMAND + tracy_free( (void*)ptr ); +#endif + break; + case QueueType::ZoneBeginAllocSrcLoc: + case QueueType::ZoneBeginAllocSrcLocCallstack: + { + int64_t t = MemRead<int64_t>( &item->zoneBegin.time ); + int64_t dt = t - m_refTimeThread; + m_refTimeThread = t; + MemWrite( &item->zoneBegin.time, dt ); + ptr = MemRead<uint64_t>( &item->zoneBegin.srcloc ); + SendSourceLocationPayload( ptr ); + tracy_free( (void*)ptr ); + break; + } + case QueueType::Callstack: + ptr = MemRead<uint64_t>( &item->callstack.ptr ); + SendCallstackPayload( ptr ); + tracy_free( (void*)ptr ); + break; + case QueueType::CallstackAlloc: + ptr = MemRead<uint64_t>( &item->callstackAlloc.nativePtr ); + if( ptr != 0 ) + { + CutCallstack( (void*)ptr, "lua_pcall" ); + SendCallstackPayload( ptr ); + tracy_free( (void*)ptr ); + } + ptr = MemRead<uint64_t>( &item->callstackAlloc.ptr ); + SendCallstackAlloc( ptr ); + tracy_free( (void*)ptr ); + break; + case QueueType::FrameImage: + { + ptr = MemRead<uint64_t>( &item->frameImage.image ); + const auto w = MemRead<uint16_t>( &item->frameImage.w ); + const auto h = MemRead<uint16_t>( &item->frameImage.h ); + const auto csz = size_t( w * h / 2 ); + SendLongString( ptr, (const char*)ptr, csz, QueueType::FrameImageData ); + tracy_free( (void*)ptr ); + break; + } + case QueueType::ZoneBegin: + case QueueType::ZoneBeginCallstack: + { + 
int64_t t = MemRead<int64_t>( &item->zoneBegin.time ); + int64_t dt = t - m_refTimeThread; + m_refTimeThread = t; + MemWrite( &item->zoneBegin.time, dt ); + break; + } + case QueueType::ZoneEnd: + { + int64_t t = MemRead<int64_t>( &item->zoneEnd.time ); + int64_t dt = t - m_refTimeThread; + m_refTimeThread = t; + MemWrite( &item->zoneEnd.time, dt ); + break; + } + case QueueType::GpuZoneBegin: + case QueueType::GpuZoneBeginCallstack: + { + int64_t t = MemRead<int64_t>( &item->gpuZoneBegin.cpuTime ); + int64_t dt = t - m_refTimeThread; + m_refTimeThread = t; + MemWrite( &item->gpuZoneBegin.cpuTime, dt ); + break; + } + case QueueType::GpuZoneEnd: + { + int64_t t = MemRead<int64_t>( &item->gpuZoneEnd.cpuTime ); + int64_t dt = t - m_refTimeThread; + m_refTimeThread = t; + MemWrite( &item->gpuZoneEnd.cpuTime, dt ); + break; + } + case QueueType::PlotData: + { + int64_t t = MemRead<int64_t>( &item->plotData.time ); + int64_t dt = t - m_refTimeThread; + m_refTimeThread = t; + MemWrite( &item->plotData.time, dt ); + break; + } + case QueueType::ContextSwitch: + { + int64_t t = MemRead<int64_t>( &item->contextSwitch.time ); + int64_t dt = t - m_refTimeCtx; + m_refTimeCtx = t; + MemWrite( &item->contextSwitch.time, dt ); + break; + } + case QueueType::ThreadWakeup: + { + int64_t t = MemRead<int64_t>( &item->threadWakeup.time ); + int64_t dt = t - m_refTimeCtx; + m_refTimeCtx = t; + MemWrite( &item->threadWakeup.time, dt ); + break; + } + case QueueType::GpuTime: + { + int64_t t = MemRead<int64_t>( &item->gpuTime.gpuTime ); + int64_t dt = t - m_refTimeGpu; + m_refTimeGpu = t; + MemWrite( &item->gpuTime.gpuTime, dt ); + break; + } + default: + assert( false ); + break; + } + } + if( !AppendData( item, QueueDataSize[idx] ) ) return DequeueStatus::ConnectionLost; + item++; + } + } + else + { + return DequeueStatus::QueueEmpty; + } + return DequeueStatus::DataDequeued; +} + +Profiler::DequeueStatus Profiler::DequeueContextSwitches( tracy::moodycamel::ConsumerToken& token, int64_t& timeStop ) +{ + const auto sz = GetQueue().try_dequeue_bulk( token, m_itemBuf, BulkSize ); + if( sz > 0 ) + { + auto end = m_itemBuf + sz; + auto item = m_itemBuf; + while( item != end ) + { + FreeAssociatedMemory( *item ); + const auto idx = MemRead<uint8_t>( &item->hdr.idx ); + if( idx == (uint8_t)QueueType::ContextSwitch ) + { + const auto csTime = MemRead<int64_t>( &item->contextSwitch.time ); + if( csTime > timeStop ) + { + timeStop = -1; + return DequeueStatus::DataDequeued; + } + int64_t dt = csTime - m_refTimeCtx; + m_refTimeCtx = csTime; + MemWrite( &item->contextSwitch.time, dt ); + if( !AppendData( item, QueueDataSize[(int)QueueType::ContextSwitch] ) ) return DequeueStatus::ConnectionLost; + } + else if( idx == (uint8_t)QueueType::ThreadWakeup ) + { + const auto csTime = MemRead<int64_t>( &item->threadWakeup.time ); + if( csTime > timeStop ) + { + timeStop = -1; + return DequeueStatus::DataDequeued; + } + int64_t dt = csTime - m_refTimeCtx; + m_refTimeCtx = csTime; + MemWrite( &item->threadWakeup.time, dt ); + if( !AppendData( item, QueueDataSize[(int)QueueType::ThreadWakeup] ) ) return DequeueStatus::ConnectionLost; + } + item++; + } + } + else + { + return DequeueStatus::QueueEmpty; + } + return DequeueStatus::DataDequeued; +} + +Profiler::DequeueStatus Profiler::DequeueSerial() +{ + { + bool lockHeld = true; + while( !m_serialLock.try_lock() ) + { + if( m_shutdownManual.load( std::memory_order_relaxed ) ) + { + lockHeld = false; + break; + } + } + if( !m_serialQueue.empty() ) m_serialQueue.swap( m_serialDequeue 
); + if( lockHeld ) + { + m_serialLock.unlock(); + } + } + + const auto sz = m_serialDequeue.size(); + if( sz > 0 ) + { + auto item = m_serialDequeue.data(); + auto end = item + sz; + while( item != end ) + { + uint64_t ptr; + const auto idx = MemRead<uint8_t>( &item->hdr.idx ); + if( idx < (int)QueueType::Terminate ) + { + switch( (QueueType)idx ) + { + case QueueType::CallstackMemory: + ptr = MemRead<uint64_t>( &item->callstackMemory.ptr ); + SendCallstackPayload( ptr ); + tracy_free( (void*)ptr ); + break; + case QueueType::LockWait: + case QueueType::LockSharedWait: + { + int64_t t = MemRead<int64_t>( &item->lockWait.time ); + int64_t dt = t - m_refTimeSerial; + m_refTimeSerial = t; + MemWrite( &item->lockWait.time, dt ); + break; + } + case QueueType::LockObtain: + case QueueType::LockSharedObtain: + { + int64_t t = MemRead<int64_t>( &item->lockObtain.time ); + int64_t dt = t - m_refTimeSerial; + m_refTimeSerial = t; + MemWrite( &item->lockObtain.time, dt ); + break; + } + case QueueType::LockRelease: + case QueueType::LockSharedRelease: + { + int64_t t = MemRead<int64_t>( &item->lockRelease.time ); + int64_t dt = t - m_refTimeSerial; + m_refTimeSerial = t; + MemWrite( &item->lockRelease.time, dt ); + break; + } + case QueueType::MemAlloc: + case QueueType::MemAllocCallstack: + { + int64_t t = MemRead<int64_t>( &item->memAlloc.time ); + int64_t dt = t - m_refTimeSerial; + m_refTimeSerial = t; + MemWrite( &item->memAlloc.time, dt ); + break; + } + case QueueType::MemFree: + case QueueType::MemFreeCallstack: + { + int64_t t = MemRead<int64_t>( &item->memFree.time ); + int64_t dt = t - m_refTimeSerial; + m_refTimeSerial = t; + MemWrite( &item->memFree.time, dt ); + break; + } + case QueueType::GpuZoneBeginSerial: + case QueueType::GpuZoneBeginCallstackSerial: + { + int64_t t = MemRead<int64_t>( &item->gpuZoneBegin.cpuTime ); + int64_t dt = t - m_refTimeSerial; + m_refTimeSerial = t; + MemWrite( &item->gpuZoneBegin.cpuTime, dt ); + break; + } + case QueueType::GpuZoneEndSerial: + { + int64_t t = MemRead<int64_t>( &item->gpuZoneEnd.cpuTime ); + int64_t dt = t - m_refTimeSerial; + m_refTimeSerial = t; + MemWrite( &item->gpuZoneEnd.cpuTime, dt ); + break; + } + case QueueType::GpuTime: + { + int64_t t = MemRead<int64_t>( &item->gpuTime.gpuTime ); + int64_t dt = t - m_refTimeGpu; + m_refTimeGpu = t; + MemWrite( &item->gpuTime.gpuTime, dt ); + break; + } + default: + assert( false ); + break; + } + } + if( !AppendData( item, QueueDataSize[idx] ) ) return DequeueStatus::ConnectionLost; + item++; + } + m_serialDequeue.clear(); + } + else + { + return DequeueStatus::QueueEmpty; + } + return DequeueStatus::DataDequeued; +} + +bool Profiler::AppendData( const void* data, size_t len ) +{ + const auto ret = NeedDataSize( len ); + AppendDataUnsafe( data, len ); + return ret; +} + +bool Profiler::CommitData() +{ + bool ret = SendData( m_buffer + m_bufferStart, m_bufferOffset - m_bufferStart ); + if( m_bufferOffset > TargetFrameSize * 2 ) m_bufferOffset = 0; + m_bufferStart = m_bufferOffset; + return ret; +} + +bool Profiler::NeedDataSize( size_t len ) +{ + assert( len <= TargetFrameSize ); + bool ret = true; + if( m_bufferOffset - m_bufferStart + len > TargetFrameSize ) + { + ret = CommitData(); + } + return ret; +} + +bool Profiler::SendData( const char* data, size_t len ) +{ + const lz4sz_t lz4sz = LZ4_compress_fast_continue( (LZ4_stream_t*)m_stream, data, m_lz4Buf + sizeof( lz4sz_t ), (int)len, LZ4Size, 1 ); + memcpy( m_lz4Buf, &lz4sz, sizeof( lz4sz ) ); + return m_sock->Send( m_lz4Buf, lz4sz + 
sizeof( lz4sz_t ) ) != -1; +} + +void Profiler::SendString( uint64_t str, const char* ptr, QueueType type ) +{ + assert( type == QueueType::StringData || + type == QueueType::ThreadName || + type == QueueType::CustomStringData || + type == QueueType::PlotName || + type == QueueType::FrameName || + type == QueueType::ExternalName || + type == QueueType::ExternalThreadName ); + + QueueItem item; + MemWrite( &item.hdr.type, type ); + MemWrite( &item.stringTransfer.ptr, str ); + + auto len = strlen( ptr ); + assert( len <= std::numeric_limits<uint16_t>::max() ); + auto l16 = uint16_t( len ); + + NeedDataSize( QueueDataSize[(int)type] + sizeof( l16 ) + l16 ); + + AppendDataUnsafe( &item, QueueDataSize[(int)type] ); + AppendDataUnsafe( &l16, sizeof( l16 ) ); + AppendDataUnsafe( ptr, l16 ); +} + +void Profiler::SendLongString( uint64_t str, const char* ptr, size_t len, QueueType type ) +{ + assert( type == QueueType::FrameImageData ); + + QueueItem item; + MemWrite( &item.hdr.type, type ); + MemWrite( &item.stringTransfer.ptr, str ); + + assert( len <= std::numeric_limits<uint32_t>::max() ); + assert( QueueDataSize[(int)type] + sizeof( uint32_t ) + len <= TargetFrameSize ); + auto l32 = uint32_t( len ); + + NeedDataSize( QueueDataSize[(int)type] + sizeof( l32 ) + l32 ); + + AppendDataUnsafe( &item, QueueDataSize[(int)type] ); + AppendDataUnsafe( &l32, sizeof( l32 ) ); + AppendDataUnsafe( ptr, l32 ); +} + +void Profiler::SendSourceLocation( uint64_t ptr ) +{ + auto srcloc = (const SourceLocationData*)ptr; + QueueItem item; + MemWrite( &item.hdr.type, QueueType::SourceLocation ); + MemWrite( &item.srcloc.name, (uint64_t)srcloc->name ); + MemWrite( &item.srcloc.file, (uint64_t)srcloc->file ); + MemWrite( &item.srcloc.function, (uint64_t)srcloc->function ); + MemWrite( &item.srcloc.line, srcloc->line ); + MemWrite( &item.srcloc.r, uint8_t( ( srcloc->color ) & 0xFF ) ); + MemWrite( &item.srcloc.g, uint8_t( ( srcloc->color >> 8 ) & 0xFF ) ); + MemWrite( &item.srcloc.b, uint8_t( ( srcloc->color >> 16 ) & 0xFF ) ); + AppendData( &item, QueueDataSize[(int)QueueType::SourceLocation] ); +} + +void Profiler::SendSourceLocationPayload( uint64_t _ptr ) +{ + auto ptr = (const char*)_ptr; + + QueueItem item; + MemWrite( &item.hdr.type, QueueType::SourceLocationPayload ); + MemWrite( &item.stringTransfer.ptr, _ptr ); + + const auto len = *((uint32_t*)ptr); + assert( len <= std::numeric_limits<uint16_t>::max() ); + assert( len > 4 ); + const auto l16 = uint16_t( len - 4 ); + + NeedDataSize( QueueDataSize[(int)QueueType::SourceLocationPayload] + sizeof( l16 ) + l16 ); + + AppendDataUnsafe( &item, QueueDataSize[(int)QueueType::SourceLocationPayload] ); + AppendDataUnsafe( &l16, sizeof( l16 ) ); + AppendDataUnsafe( ptr + 4, l16 ); +} + +void Profiler::SendCallstackPayload( uint64_t _ptr ) +{ + auto ptr = (uintptr_t*)_ptr; + + QueueItem item; + MemWrite( &item.hdr.type, QueueType::CallstackPayload ); + MemWrite( &item.stringTransfer.ptr, _ptr ); + + const auto sz = *ptr++; + const auto len = sz * sizeof( uint64_t ); + const auto l16 = uint16_t( len ); + + NeedDataSize( QueueDataSize[(int)QueueType::CallstackPayload] + sizeof( l16 ) + l16 ); + + AppendDataUnsafe( &item, QueueDataSize[(int)QueueType::CallstackPayload] ); + AppendDataUnsafe( &l16, sizeof( l16 ) ); + + if( compile_time_condition<sizeof( uintptr_t ) == sizeof( uint64_t )>::value ) + { + AppendDataUnsafe( ptr, sizeof( uint64_t ) * sz ); + } + else + { + for( uintptr_t i=0; i<sz; i++ ) + { + const auto val = uint64_t( *ptr++ ); + AppendDataUnsafe( &val, 
sizeof( uint64_t ) ); + } + } +} + +void Profiler::SendCallstackAlloc( uint64_t _ptr ) +{ + auto ptr = (const char*)_ptr; + + QueueItem item; + MemWrite( &item.hdr.type, QueueType::CallstackAllocPayload ); + MemWrite( &item.stringTransfer.ptr, _ptr ); + + const auto len = *((uint32_t*)ptr); + assert( len <= std::numeric_limits<uint16_t>::max() ); + const auto l16 = uint16_t( len ); + + NeedDataSize( QueueDataSize[(int)QueueType::CallstackAllocPayload] + sizeof( l16 ) + l16 ); + + AppendDataUnsafe( &item, QueueDataSize[(int)QueueType::CallstackAllocPayload] ); + AppendDataUnsafe( &l16, sizeof( l16 ) ); + AppendDataUnsafe( ptr + 4, l16 ); +} + +void Profiler::SendCallstackFrame( uint64_t ptr ) +{ +#ifdef TRACY_HAS_CALLSTACK + const auto frameData = DecodeCallstackPtr( ptr ); + + { + QueueItem item; + MemWrite( &item.hdr.type, QueueType::CallstackFrameSize ); + MemWrite( &item.callstackFrameSize.ptr, ptr ); + MemWrite( &item.callstackFrameSize.size, frameData.size ); + + AppendData( &item, QueueDataSize[(int)QueueType::CallstackFrameSize] ); + } + + for( uint8_t i=0; i<frameData.size; i++ ) + { + const auto& frame = frameData.data[i]; + + SendString( uint64_t( frame.name ), frame.name, QueueType::CustomStringData ); + SendString( uint64_t( frame.file ), frame.file, QueueType::CustomStringData ); + + QueueItem item; + MemWrite( &item.hdr.type, QueueType::CallstackFrame ); + MemWrite( &item.callstackFrame.name, (uint64_t)frame.name ); + MemWrite( &item.callstackFrame.file, (uint64_t)frame.file ); + MemWrite( &item.callstackFrame.line, frame.line ); + + AppendData( &item, QueueDataSize[(int)QueueType::CallstackFrame] ); + + tracy_free( (void*)frame.name ); + tracy_free( (void*)frame.file ); + } +#endif +} + + +static bool DontExit() { return false; } + +bool Profiler::HandleServerQuery() +{ + uint8_t type; + if( !m_sock->Read( &type, sizeof( type ), 10, DontExit ) ) return false; + + uint64_t ptr; + if( !m_sock->Read( &ptr, sizeof( ptr ), 10, DontExit ) ) return false; + + switch( type ) + { + case ServerQueryString: + SendString( ptr, (const char*)ptr, QueueType::StringData ); + break; + case ServerQueryThreadString: + if( ptr == m_mainThread ) + { + SendString( ptr, "Main thread", QueueType::ThreadName ); + } + else + { + SendString( ptr, GetThreadName( ptr ), QueueType::ThreadName ); + } + break; + case ServerQuerySourceLocation: + SendSourceLocation( ptr ); + break; + case ServerQueryPlotName: + SendString( ptr, (const char*)ptr, QueueType::PlotName ); + break; + case ServerQueryTerminate: + return false; + case ServerQueryCallstackFrame: + SendCallstackFrame( ptr ); + break; + case ServerQueryFrameName: + SendString( ptr, (const char*)ptr, QueueType::FrameName ); + break; + case ServerQueryDisconnect: + HandleDisconnect(); + return false; +#ifdef TRACY_HAS_SYSTEM_TRACING + case ServerQueryExternalName: + SysTraceSendExternalName( ptr ); + break; +#endif + case ServerQueryParameter: + HandleParameter( ptr ); + break; + default: + assert( false ); + break; + } + + return true; +} + +void Profiler::HandleDisconnect() +{ + moodycamel::ConsumerToken token( GetQueue() ); + +#ifdef TRACY_HAS_SYSTEM_TRACING + if( s_sysTraceThread ) + { + auto timestamp = GetTime(); + for(;;) + { + const auto status = DequeueContextSwitches( token, timestamp ); + if( status == DequeueStatus::ConnectionLost ) + { + return; + } + else if( status == DequeueStatus::QueueEmpty ) + { + if( m_bufferOffset != m_bufferStart ) + { + if( !CommitData() ) return; + } + } + if( timestamp < 0 ) + { + if( m_bufferOffset != 
m_bufferStart ) + { + if( !CommitData() ) return; + } + break; + } + ClearSerial(); + if( m_sock->HasData() ) + { + while( m_sock->HasData() ) + { + if( !HandleServerQuery() ) return; + } + if( m_bufferOffset != m_bufferStart ) + { + if( !CommitData() ) return; + } + } + else + { + if( m_bufferOffset != m_bufferStart ) + { + if( !CommitData() ) return; + } + std::this_thread::sleep_for( std::chrono::milliseconds( 10 ) ); + } + } + } +#endif + + QueueItem terminate; + MemWrite( &terminate.hdr.type, QueueType::Terminate ); + if( !SendData( (const char*)&terminate, 1 ) ) return; + for(;;) + { + ClearQueues( token ); + if( m_sock->HasData() ) + { + while( m_sock->HasData() ) + { + if( !HandleServerQuery() ) return; + } + if( m_bufferOffset != m_bufferStart ) + { + if( !CommitData() ) return; + } + } + else + { + if( m_bufferOffset != m_bufferStart ) + { + if( !CommitData() ) return; + } + std::this_thread::sleep_for( std::chrono::milliseconds( 10 ) ); + } + } +} + +void Profiler::CalibrateTimer() +{ +#ifdef TRACY_HW_TIMER +# if !defined TARGET_OS_IOS && __ARM_ARCH >= 6 + m_timerMul = 1.; +# else + std::atomic_signal_fence( std::memory_order_acq_rel ); + const auto t0 = std::chrono::high_resolution_clock::now(); + const auto r0 = GetTime(); + std::atomic_signal_fence( std::memory_order_acq_rel ); + std::this_thread::sleep_for( std::chrono::milliseconds( 200 ) ); + std::atomic_signal_fence( std::memory_order_acq_rel ); + const auto t1 = std::chrono::high_resolution_clock::now(); + const auto r1 = GetTime(); + std::atomic_signal_fence( std::memory_order_acq_rel ); + + const auto dt = std::chrono::duration_cast<std::chrono::nanoseconds>( t1 - t0 ).count(); + const auto dr = r1 - r0; + + m_timerMul = double( dt ) / double( dr ); +# endif +#else + m_timerMul = 1.; +#endif +} + +void Profiler::CalibrateDelay() +{ + enum { Iterations = 50000 }; + + auto mindiff = std::numeric_limits<int64_t>::max(); + for( int i=0; i<Iterations * 10; i++ ) + { + const auto t0i = GetTime(); + const auto t1i = GetTime(); + const auto dti = t1i - t0i; + if( dti > 0 && dti < mindiff ) mindiff = dti; + } + m_resolution = mindiff; + +#ifdef TRACY_DELAYED_INIT + m_delay = m_resolution; +#else + enum { Events = Iterations * 2 }; // start + end + static_assert( Events < QueuePrealloc, "Delay calibration loop will allocate memory in queue" ); + + moodycamel::ProducerToken ptoken_detail( GetQueue() ); + moodycamel::ConcurrentQueue<QueueItem>::ExplicitProducer* ptoken = GetQueue().get_explicit_producer( ptoken_detail ); + static const tracy::SourceLocationData __tracy_source_location { nullptr, __FUNCTION__, __FILE__, (uint32_t)__LINE__, 0 }; + const auto t0 = GetTime(); + for( int i=0; i<Iterations; i++ ) + { + { + Magic magic; + auto& tail = ptoken->get_tail_index(); + auto item = ptoken->enqueue_begin( magic ); + MemWrite( &item->hdr.type, QueueType::ZoneBegin ); + MemWrite( &item->zoneBegin.time, Profiler::GetTime() ); + MemWrite( &item->zoneBegin.srcloc, (uint64_t)&__tracy_source_location ); + tail.store( magic + 1, std::memory_order_release ); + } + { + Magic magic; + auto& tail = ptoken->get_tail_index(); + auto item = ptoken->enqueue_begin( magic ); + MemWrite( &item->hdr.type, QueueType::ZoneEnd ); + MemWrite( &item->zoneEnd.time, GetTime() ); + tail.store( magic + 1, std::memory_order_release ); + } + } + const auto t1 = GetTime(); + const auto dt = t1 - t0; + m_delay = dt / Events; + + enum { Bulk = 1000 }; + moodycamel::ConsumerToken token( GetQueue() ); + int left = Events; + QueueItem item[Bulk]; + while( left != 0 
) + { + const auto sz = GetQueue().try_dequeue_bulk( token, item, std::min( left, (int)Bulk ) ); + assert( sz > 0 ); + left -= (int)sz; + } + assert( GetQueue().size_approx() == 0 ); +#endif +} + +void Profiler::SendCallstack( int depth, const char* skipBefore ) +{ +#ifdef TRACY_HAS_CALLSTACK + auto ptr = Callstack( depth ); + CutCallstack( ptr, skipBefore ); + + Magic magic; + auto token = GetToken(); + auto& tail = token->get_tail_index(); + auto item = token->enqueue_begin( magic ); + MemWrite( &item->hdr.type, QueueType::Callstack ); + MemWrite( &item->callstack.ptr, ptr ); + tail.store( magic + 1, std::memory_order_release ); +#endif +} + +void Profiler::CutCallstack( void* callstack, const char* skipBefore ) +{ +#ifdef TRACY_HAS_CALLSTACK + auto data = (uintptr_t*)callstack; + const auto sz = *data++; + uintptr_t i; + for( i=0; i<sz; i++ ) + { + auto name = DecodeCallstackPtrFast( uint64_t( data[i] ) ); + const bool found = strcmp( name, skipBefore ) == 0; + if( found ) + { + i++; + break; + } + } + + if( i != sz ) + { + memmove( data, data + i, ( sz - i ) * sizeof( uintptr_t* ) ); + *--data = sz - i; + } +#endif +} + +#ifdef TRACY_HAS_SYSTIME +void Profiler::ProcessSysTime() +{ + if( m_shutdown.load( std::memory_order_relaxed ) ) return; + auto t = std::chrono::high_resolution_clock::now().time_since_epoch().count(); + if( t - m_sysTimeLast > 100000000 ) // 100 ms + { + auto sysTime = m_sysTime.Get(); + if( sysTime >= 0 ) + { + m_sysTimeLast = t; + + Magic magic; + auto token = GetToken(); + auto& tail = token->get_tail_index(); + auto item = token->enqueue_begin( magic ); + MemWrite( &item->hdr.type, QueueType::SysTimeReport ); + MemWrite( &item->sysTime.time, GetTime() ); + MemWrite( &item->sysTime.sysTime, sysTime ); + tail.store( magic + 1, std::memory_order_release ); + } + } +} +#endif + +void Profiler::ParameterSetup( uint32_t idx, const char* name, bool isBool, int32_t val ) +{ + tracy::Magic magic; + auto token = tracy::GetToken(); + auto& tail = token->get_tail_index(); + auto item = token->enqueue_begin( magic ); + tracy::MemWrite( &item->hdr.type, tracy::QueueType::ParamSetup ); + tracy::MemWrite( &item->paramSetup.idx, idx ); + tracy::MemWrite( &item->paramSetup.name, (uint64_t)name ); + tracy::MemWrite( &item->paramSetup.isBool, (uint8_t)isBool ); + tracy::MemWrite( &item->paramSetup.val, val ); + +#ifdef TRACY_ON_DEMAND + GetProfiler().DeferItem( *item ); +#endif + + tail.store( magic + 1, std::memory_order_release ); +} + +void Profiler::HandleParameter( uint64_t payload ) +{ + assert( m_paramCallback ); + const auto idx = uint32_t( payload >> 32 ); + const auto val = int32_t( payload & 0xFFFFFFFF ); + m_paramCallback( idx, val ); +} + +} + +#ifdef __cplusplus +extern "C" { +#endif + +TRACY_API TracyCZoneCtx ___tracy_emit_zone_begin( const struct ___tracy_source_location_data* srcloc, int active ) +{ + ___tracy_c_zone_context ctx; +#ifdef TRACY_ON_DEMAND + ctx.active = active && tracy::GetProfiler().IsConnected(); +#else + ctx.active = active; +#endif + if( !ctx.active ) return ctx; + const auto id = tracy::GetProfiler().GetNextZoneId(); + ctx.id = id; + +#ifndef TRACY_NO_VERIFY + { + tracy::Magic magic; + auto token = tracy::GetToken(); + auto& tail = token->get_tail_index(); + auto item = token->enqueue_begin( magic ); + tracy::MemWrite( &item->hdr.type, tracy::QueueType::ZoneValidation ); + tracy::MemWrite( &item->zoneValidation.id, id ); + tail.store( magic + 1, std::memory_order_release ); + } +#endif + { + tracy::Magic magic; + auto token = tracy::GetToken(); + 
auto& tail = token->get_tail_index(); + auto item = token->enqueue_begin( magic ); + tracy::MemWrite( &item->hdr.type, tracy::QueueType::ZoneBegin ); + tracy::MemWrite( &item->zoneBegin.time, tracy::Profiler::GetTime() ); + tracy::MemWrite( &item->zoneBegin.srcloc, (uint64_t)srcloc ); + tail.store( magic + 1, std::memory_order_release ); + } + return ctx; +} + +TRACY_API TracyCZoneCtx ___tracy_emit_zone_begin_callstack( const struct ___tracy_source_location_data* srcloc, int depth, int active ) +{ + ___tracy_c_zone_context ctx; +#ifdef TRACY_ON_DEMAND + ctx.active = active && tracy::GetProfiler().IsConnected(); +#else + ctx.active = active; +#endif + if( !ctx.active ) return ctx; + const auto id = tracy::GetProfiler().GetNextZoneId(); + ctx.id = id; + +#ifndef TRACY_NO_VERIFY + { + tracy::Magic magic; + auto token = tracy::GetToken(); + auto& tail = token->get_tail_index(); + auto item = token->enqueue_begin( magic ); + tracy::MemWrite( &item->hdr.type, tracy::QueueType::ZoneValidation ); + tracy::MemWrite( &item->zoneValidation.id, id ); + tail.store( magic + 1, std::memory_order_release ); + } +#endif + { + tracy::Magic magic; + auto token = tracy::GetToken(); + auto& tail = token->get_tail_index(); + auto item = token->enqueue_begin( magic ); + tracy::MemWrite( &item->hdr.type, tracy::QueueType::ZoneBeginCallstack ); + tracy::MemWrite( &item->zoneBegin.time, tracy::Profiler::GetTime() ); + tracy::MemWrite( &item->zoneBegin.srcloc, (uint64_t)srcloc ); + tail.store( magic + 1, std::memory_order_release ); + } + + tracy::GetProfiler().SendCallstack( depth ); + return ctx; +} + +TRACY_API void ___tracy_emit_zone_end( TracyCZoneCtx ctx ) +{ + if( !ctx.active ) return; +#ifndef TRACY_NO_VERIFY + { + tracy::Magic magic; + auto token = tracy::GetToken(); + auto& tail = token->get_tail_index(); + auto item = token->enqueue_begin( magic ); + tracy::MemWrite( &item->hdr.type, tracy::QueueType::ZoneValidation ); + tracy::MemWrite( &item->zoneValidation.id, ctx.id ); + tail.store( magic + 1, std::memory_order_release ); + } +#endif + { + tracy::Magic magic; + auto token = tracy::GetToken(); + auto& tail = token->get_tail_index(); + auto item = token->enqueue_begin( magic ); + tracy::MemWrite( &item->hdr.type, tracy::QueueType::ZoneEnd ); + tracy::MemWrite( &item->zoneEnd.time, tracy::Profiler::GetTime() ); + tail.store( magic + 1, std::memory_order_release ); + } +} + +TRACY_API void ___tracy_emit_zone_text( TracyCZoneCtx ctx, const char* txt, size_t size ) +{ + if( !ctx.active ) return; + auto ptr = (char*)tracy::tracy_malloc( size+1 ); + memcpy( ptr, txt, size ); + ptr[size] = '\0'; +#ifndef TRACY_NO_VERIFY + { + tracy::Magic magic; + auto token = tracy::GetToken(); + auto& tail = token->get_tail_index(); + auto item = token->enqueue_begin( magic ); + tracy::MemWrite( &item->hdr.type, tracy::QueueType::ZoneValidation ); + tracy::MemWrite( &item->zoneValidation.id, ctx.id ); + tail.store( magic + 1, std::memory_order_release ); + } +#endif + { + tracy::Magic magic; + auto token = tracy::GetToken(); + auto& tail = token->get_tail_index(); + auto item = token->enqueue_begin( magic ); + tracy::MemWrite( &item->hdr.type, tracy::QueueType::ZoneText ); + tracy::MemWrite( &item->zoneText.text, (uint64_t)ptr ); + tail.store( magic + 1, std::memory_order_release ); + } +} + +TRACY_API void ___tracy_emit_zone_name( TracyCZoneCtx ctx, const char* txt, size_t size ) +{ + if( !ctx.active ) return; + auto ptr = (char*)tracy::tracy_malloc( size+1 ); + memcpy( ptr, txt, size ); + ptr[size] = '\0'; +#ifndef 
TRACY_NO_VERIFY + { + tracy::Magic magic; + auto token = tracy::GetToken(); + auto& tail = token->get_tail_index(); + auto item = token->enqueue_begin( magic ); + tracy::MemWrite( &item->hdr.type, tracy::QueueType::ZoneValidation ); + tracy::MemWrite( &item->zoneValidation.id, ctx.id ); + tail.store( magic + 1, std::memory_order_release ); + } +#endif + { + tracy::Magic magic; + auto token = tracy::GetToken(); + auto& tail = token->get_tail_index(); + auto item = token->enqueue_begin( magic ); + tracy::MemWrite( &item->hdr.type, tracy::QueueType::ZoneName ); + tracy::MemWrite( &item->zoneText.text, (uint64_t)ptr ); + tail.store( magic + 1, std::memory_order_release ); + } +} + +TRACY_API void ___tracy_emit_memory_alloc( const void* ptr, size_t size ) { tracy::Profiler::MemAlloc( ptr, size ); } +TRACY_API void ___tracy_emit_memory_alloc_callstack( const void* ptr, size_t size, int depth ) { tracy::Profiler::MemAllocCallstack( ptr, size, depth ); } +TRACY_API void ___tracy_emit_memory_free( const void* ptr ) { tracy::Profiler::MemFree( ptr ); } +TRACY_API void ___tracy_emit_memory_free_callstack( const void* ptr, int depth ) { tracy::Profiler::MemFreeCallstack( ptr, depth ); } +TRACY_API void ___tracy_emit_frame_mark( const char* name ) { tracy::Profiler::SendFrameMark( name ); } +TRACY_API void ___tracy_emit_frame_mark_start( const char* name ) { tracy::Profiler::SendFrameMark( name, tracy::QueueType::FrameMarkMsgStart ); } +TRACY_API void ___tracy_emit_frame_mark_end( const char* name ) { tracy::Profiler::SendFrameMark( name, tracy::QueueType::FrameMarkMsgEnd ); } +TRACY_API void ___tracy_emit_frame_image( const void* image, uint16_t w, uint16_t h, uint8_t offset, int flip ) { tracy::Profiler::SendFrameImage( image, w, h, offset, flip ); } +TRACY_API void ___tracy_emit_plot( const char* name, double val ) { tracy::Profiler::PlotData( name, val ); } +TRACY_API void ___tracy_emit_message( const char* txt, size_t size, int callstack ) { tracy::Profiler::Message( txt, size, callstack ); } +TRACY_API void ___tracy_emit_messageL( const char* txt, int callstack ) { tracy::Profiler::Message( txt, callstack ); } +TRACY_API void ___tracy_emit_messageC( const char* txt, size_t size, uint32_t color, int callstack ) { tracy::Profiler::MessageColor( txt, size, color, callstack ); } +TRACY_API void ___tracy_emit_messageLC( const char* txt, uint32_t color, int callstack ) { tracy::Profiler::MessageColor( txt, color, callstack ); } +TRACY_API void ___tracy_emit_message_appinfo( const char* txt, size_t size ) { tracy::Profiler::MessageAppInfo( txt, size ); } + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/libs/tracy/client/TracyProfiler.hpp b/libs/tracy/client/TracyProfiler.hpp @@ -0,0 +1,620 @@ +#ifndef __TRACYPROFILER_HPP__ +#define __TRACYPROFILER_HPP__ + +#include <assert.h> +#include <atomic> +#include <stdint.h> +#include <string.h> + +#include "tracy_concurrentqueue.h" +#include "TracyCallstack.hpp" +#include "TracySysTime.hpp" +#include "TracyFastVector.hpp" +#include "../common/TracyQueue.hpp" +#include "../common/TracyAlign.hpp" +#include "../common/TracyAlloc.hpp" +#include "../common/TracyMutex.hpp" + +#if defined _WIN32 || defined __CYGWIN__ +# include <intrin.h> +#endif +#ifdef __APPLE__ +# include <TargetConditionals.h> +# include <mach/mach_time.h> +#endif + +#if defined _WIN32 || defined __CYGWIN__ || ( ( defined __i386 || defined _M_IX86 || defined __x86_64__ || defined _M_X64 ) && !defined __ANDROID__ ) || __ARM_ARCH >= 6 +# define TRACY_HW_TIMER +#endif + +#if !defined 
TRACY_HW_TIMER || ( __ARM_ARCH >= 6 && !defined CLOCK_MONOTONIC_RAW ) + #include <chrono> +#endif + +#ifndef TracyConcat +# define TracyConcat(x,y) TracyConcatIndirect(x,y) +#endif +#ifndef TracyConcatIndirect +# define TracyConcatIndirect(x,y) x##y +#endif + +namespace tracy +{ + +class GpuCtx; +class Profiler; +class Socket; +class UdpBroadcast; + +struct GpuCtxWrapper +{ + GpuCtx* ptr; +}; + +TRACY_API moodycamel::ConcurrentQueue<QueueItem>::ExplicitProducer* GetToken(); +TRACY_API Profiler& GetProfiler(); +TRACY_API std::atomic<uint32_t>& GetLockCounter(); +TRACY_API std::atomic<uint8_t>& GetGpuCtxCounter(); +TRACY_API GpuCtxWrapper& GetGpuCtx(); +TRACY_API uint64_t GetThreadHandle(); + +TRACY_API void InitRPMallocThread(); + +struct SourceLocationData +{ + const char* name; + const char* function; + const char* file; + uint32_t line; + uint32_t color; +}; + +#ifdef TRACY_ON_DEMAND +struct LuaZoneState +{ + uint32_t counter; + bool active; +}; +#endif + +using Magic = moodycamel::ConcurrentQueueDefaultTraits::index_t; + + +typedef void(*ParameterCallback)( uint32_t idx, int32_t val ); + +class Profiler +{ + struct FrameImageQueueItem + { + void* image; + uint64_t frame; + uint16_t w; + uint16_t h; + uint8_t offset; + bool flip; + }; + +public: + Profiler(); + ~Profiler(); + + static tracy_force_inline int64_t GetTime() + { +#ifdef TRACY_HW_TIMER +# if TARGET_OS_IOS == 1 + return mach_absolute_time(); +# elif __ARM_ARCH >= 6 +# ifdef CLOCK_MONOTONIC_RAW + struct timespec ts; + clock_gettime( CLOCK_MONOTONIC_RAW, &ts ); + return int64_t( ts.tv_sec ) * 1000000000ll + int64_t( ts.tv_nsec ); +# else + return std::chrono::duration_cast<std::chrono::nanoseconds>( std::chrono::high_resolution_clock::now().time_since_epoch() ).count(); +# endif +# elif defined _WIN32 || defined __CYGWIN__ + return int64_t( __rdtsc() ); +# elif defined __i386 || defined _M_IX86 + uint32_t eax, edx; + asm volatile ( "rdtsc" : "=a" (eax), "=d" (edx) ); + return ( uint64_t( edx ) << 32 ) + uint64_t( eax ); +# elif defined __x86_64__ || defined _M_X64 + uint64_t rax, rdx; + asm volatile ( "rdtsc" : "=a" (rax), "=d" (rdx) ); + return ( rdx << 32 ) + rax; +# endif +#else + return std::chrono::duration_cast<std::chrono::nanoseconds>( std::chrono::high_resolution_clock::now().time_since_epoch() ).count(); +#endif + } + + tracy_force_inline uint32_t GetNextZoneId() + { + return m_zoneId.fetch_add( 1, std::memory_order_relaxed ); + } + + static tracy_force_inline QueueItem* QueueSerial() + { + auto& p = GetProfiler(); + p.m_serialLock.lock(); + return p.m_serialQueue.prepare_next(); + } + + static tracy_force_inline void QueueSerialFinish() + { + auto& p = GetProfiler(); + p.m_serialQueue.commit_next(); + p.m_serialLock.unlock(); + } + + static tracy_force_inline void SendFrameMark( const char* name ) + { + if( !name ) GetProfiler().m_frameCount.fetch_add( 1, std::memory_order_relaxed ); +#ifdef TRACY_ON_DEMAND + if( !GetProfiler().IsConnected() ) return; +#endif + Magic magic; + auto token = GetToken(); + auto& tail = token->get_tail_index(); + auto item = token->enqueue_begin( magic ); + MemWrite( &item->hdr.type, QueueType::FrameMarkMsg ); + MemWrite( &item->frameMark.time, GetTime() ); + MemWrite( &item->frameMark.name, uint64_t( name ) ); + tail.store( magic + 1, std::memory_order_release ); + } + + static tracy_force_inline void SendFrameMark( const char* name, QueueType type ) + { + assert( type == QueueType::FrameMarkMsgStart || type == QueueType::FrameMarkMsgEnd ); +#ifdef TRACY_ON_DEMAND + if( 
!GetProfiler().IsConnected() ) return; +#endif + auto item = QueueSerial(); + MemWrite( &item->hdr.type, type ); + MemWrite( &item->frameMark.time, GetTime() ); + MemWrite( &item->frameMark.name, uint64_t( name ) ); + QueueSerialFinish(); + } + + static tracy_force_inline void SendFrameImage( const void* image, uint16_t w, uint16_t h, uint8_t offset, bool flip ) + { + auto& profiler = GetProfiler(); +#ifdef TRACY_ON_DEMAND + if( !profiler.IsConnected() ) return; +#endif + const auto sz = size_t( w ) * size_t( h ) * 4; + auto ptr = (char*)tracy_malloc( sz ); + memcpy( ptr, image, sz ); + + profiler.m_fiLock.lock(); + auto fi = profiler.m_fiQueue.prepare_next(); + fi->image = ptr; + fi->frame = profiler.m_frameCount.load( std::memory_order_relaxed ) - offset; + fi->w = w; + fi->h = h; + fi->flip = flip; + profiler.m_fiQueue.commit_next(); + profiler.m_fiLock.unlock(); + } + + static tracy_force_inline void PlotData( const char* name, int64_t val ) + { +#ifdef TRACY_ON_DEMAND + if( !GetProfiler().IsConnected() ) return; +#endif + Magic magic; + auto token = GetToken(); + auto& tail = token->get_tail_index(); + auto item = token->enqueue_begin( magic ); + MemWrite( &item->hdr.type, QueueType::PlotData ); + MemWrite( &item->plotData.name, (uint64_t)name ); + MemWrite( &item->plotData.time, GetTime() ); + MemWrite( &item->plotData.type, PlotDataType::Int ); + MemWrite( &item->plotData.data.i, val ); + tail.store( magic + 1, std::memory_order_release ); + } + + static tracy_force_inline void PlotData( const char* name, float val ) + { +#ifdef TRACY_ON_DEMAND + if( !GetProfiler().IsConnected() ) return; +#endif + Magic magic; + auto token = GetToken(); + auto& tail = token->get_tail_index(); + auto item = token->enqueue_begin( magic ); + MemWrite( &item->hdr.type, QueueType::PlotData ); + MemWrite( &item->plotData.name, (uint64_t)name ); + MemWrite( &item->plotData.time, GetTime() ); + MemWrite( &item->plotData.type, PlotDataType::Float ); + MemWrite( &item->plotData.data.f, val ); + tail.store( magic + 1, std::memory_order_release ); + } + + static tracy_force_inline void PlotData( const char* name, double val ) + { +#ifdef TRACY_ON_DEMAND + if( !GetProfiler().IsConnected() ) return; +#endif + Magic magic; + auto token = GetToken(); + auto& tail = token->get_tail_index(); + auto item = token->enqueue_begin( magic ); + MemWrite( &item->hdr.type, QueueType::PlotData ); + MemWrite( &item->plotData.name, (uint64_t)name ); + MemWrite( &item->plotData.time, GetTime() ); + MemWrite( &item->plotData.type, PlotDataType::Double ); + MemWrite( &item->plotData.data.d, val ); + tail.store( magic + 1, std::memory_order_release ); + } + + static tracy_force_inline void ConfigurePlot( const char* name, PlotFormatType type ) + { + Magic magic; + auto token = GetToken(); + auto& tail = token->get_tail_index(); + auto item = token->enqueue_begin( magic ); + MemWrite( &item->hdr.type, QueueType::PlotConfig ); + MemWrite( &item->plotConfig.name, (uint64_t)name ); + MemWrite( &item->plotConfig.type, (uint8_t)type ); + +#ifdef TRACY_ON_DEMAND + GetProfiler().DeferItem( *item ); +#endif + + tail.store( magic + 1, std::memory_order_release ); + } + + static tracy_force_inline void Message( const char* txt, size_t size, int callstack ) + { +#ifdef TRACY_ON_DEMAND + if( !GetProfiler().IsConnected() ) return; +#endif + Magic magic; + auto token = GetToken(); + auto ptr = (char*)tracy_malloc( size+1 ); + memcpy( ptr, txt, size ); + ptr[size] = '\0'; + auto& tail = token->get_tail_index(); + auto item = token->enqueue_begin( 
magic ); + MemWrite( &item->hdr.type, callstack == 0 ? QueueType::Message : QueueType::MessageCallstack ); + MemWrite( &item->message.time, GetTime() ); + MemWrite( &item->message.text, (uint64_t)ptr ); + tail.store( magic + 1, std::memory_order_release ); + + if( callstack != 0 ) tracy::GetProfiler().SendCallstack( callstack ); + } + + static tracy_force_inline void Message( const char* txt, int callstack ) + { +#ifdef TRACY_ON_DEMAND + if( !GetProfiler().IsConnected() ) return; +#endif + Magic magic; + auto token = GetToken(); + auto& tail = token->get_tail_index(); + auto item = token->enqueue_begin( magic ); + MemWrite( &item->hdr.type, callstack == 0 ? QueueType::MessageLiteral : QueueType::MessageLiteralCallstack ); + MemWrite( &item->message.time, GetTime() ); + MemWrite( &item->message.text, (uint64_t)txt ); + tail.store( magic + 1, std::memory_order_release ); + + if( callstack != 0 ) tracy::GetProfiler().SendCallstack( callstack ); + } + + static tracy_force_inline void MessageColor( const char* txt, size_t size, uint32_t color, int callstack ) + { +#ifdef TRACY_ON_DEMAND + if( !GetProfiler().IsConnected() ) return; +#endif + Magic magic; + auto token = GetToken(); + auto ptr = (char*)tracy_malloc( size+1 ); + memcpy( ptr, txt, size ); + ptr[size] = '\0'; + auto& tail = token->get_tail_index(); + auto item = token->enqueue_begin( magic ); + MemWrite( &item->hdr.type, callstack == 0 ? QueueType::MessageColor : QueueType::MessageColorCallstack ); + MemWrite( &item->messageColor.time, GetTime() ); + MemWrite( &item->messageColor.text, (uint64_t)ptr ); + MemWrite( &item->messageColor.r, uint8_t( ( color ) & 0xFF ) ); + MemWrite( &item->messageColor.g, uint8_t( ( color >> 8 ) & 0xFF ) ); + MemWrite( &item->messageColor.b, uint8_t( ( color >> 16 ) & 0xFF ) ); + tail.store( magic + 1, std::memory_order_release ); + + if( callstack != 0 ) tracy::GetProfiler().SendCallstack( callstack ); + } + + static tracy_force_inline void MessageColor( const char* txt, uint32_t color, int callstack ) + { +#ifdef TRACY_ON_DEMAND + if( !GetProfiler().IsConnected() ) return; +#endif + Magic magic; + auto token = GetToken(); + auto& tail = token->get_tail_index(); + auto item = token->enqueue_begin( magic ); + MemWrite( &item->hdr.type, callstack == 0 ? 
QueueType::MessageLiteralColor : QueueType::MessageLiteralColorCallstack ); + MemWrite( &item->messageColor.time, GetTime() ); + MemWrite( &item->messageColor.text, (uint64_t)txt ); + MemWrite( &item->messageColor.r, uint8_t( ( color ) & 0xFF ) ); + MemWrite( &item->messageColor.g, uint8_t( ( color >> 8 ) & 0xFF ) ); + MemWrite( &item->messageColor.b, uint8_t( ( color >> 16 ) & 0xFF ) ); + tail.store( magic + 1, std::memory_order_release ); + + if( callstack != 0 ) tracy::GetProfiler().SendCallstack( callstack ); + } + + static tracy_force_inline void MessageAppInfo( const char* txt, size_t size ) + { + Magic magic; + auto token = GetToken(); + auto ptr = (char*)tracy_malloc( size+1 ); + memcpy( ptr, txt, size ); + ptr[size] = '\0'; + auto& tail = token->get_tail_index(); + auto item = token->enqueue_begin( magic ); + MemWrite( &item->hdr.type, QueueType::MessageAppInfo ); + MemWrite( &item->message.time, GetTime() ); + MemWrite( &item->message.text, (uint64_t)ptr ); + +#ifdef TRACY_ON_DEMAND + GetProfiler().DeferItem( *item ); +#endif + + tail.store( magic + 1, std::memory_order_release ); + } + + static tracy_force_inline void MemAlloc( const void* ptr, size_t size ) + { +#ifdef TRACY_ON_DEMAND + if( !GetProfiler().IsConnected() ) return; +#endif + const auto thread = GetThreadHandle(); + + GetProfiler().m_serialLock.lock(); + SendMemAlloc( QueueType::MemAlloc, thread, ptr, size ); + GetProfiler().m_serialLock.unlock(); + } + + static tracy_force_inline void MemFree( const void* ptr ) + { +#ifdef TRACY_ON_DEMAND + if( !GetProfiler().IsConnected() ) return; +#endif + const auto thread = GetThreadHandle(); + + GetProfiler().m_serialLock.lock(); + SendMemFree( QueueType::MemFree, thread, ptr ); + GetProfiler().m_serialLock.unlock(); + } + + static tracy_force_inline void MemAllocCallstack( const void* ptr, size_t size, int depth ) + { +#ifdef TRACY_HAS_CALLSTACK + auto& profiler = GetProfiler(); +# ifdef TRACY_ON_DEMAND + if( !profiler.IsConnected() ) return; +# endif + const auto thread = GetThreadHandle(); + + rpmalloc_thread_initialize(); + auto callstack = Callstack( depth ); + + profiler.m_serialLock.lock(); + SendMemAlloc( QueueType::MemAllocCallstack, thread, ptr, size ); + SendCallstackMemory( callstack ); + profiler.m_serialLock.unlock(); +#else + MemAlloc( ptr, size ); +#endif + } + + static tracy_force_inline void MemFreeCallstack( const void* ptr, int depth ) + { +#ifdef TRACY_HAS_CALLSTACK + auto& profiler = GetProfiler(); +# ifdef TRACY_ON_DEMAND + if( !profiler.IsConnected() ) return; +# endif + const auto thread = GetThreadHandle(); + + rpmalloc_thread_initialize(); + auto callstack = Callstack( depth ); + + profiler.m_serialLock.lock(); + SendMemFree( QueueType::MemFreeCallstack, thread, ptr ); + SendCallstackMemory( callstack ); + profiler.m_serialLock.unlock(); +#else + MemFree( ptr ); +#endif + } + + static tracy_force_inline void SendCallstack( int depth ) + { +#ifdef TRACY_HAS_CALLSTACK + auto ptr = Callstack( depth ); + Magic magic; + auto token = GetToken(); + auto& tail = token->get_tail_index(); + auto item = token->enqueue_begin( magic ); + MemWrite( &item->hdr.type, QueueType::Callstack ); + MemWrite( &item->callstack.ptr, ptr ); + tail.store( magic + 1, std::memory_order_release ); +#endif + } + + static void ParameterRegister( ParameterCallback cb ) { GetProfiler().m_paramCallback = cb; } + static void ParameterSetup( uint32_t idx, const char* name, bool isBool, int32_t val ); + + void SendCallstack( int depth, const char* skipBefore ); + static void 
CutCallstack( void* callstack, const char* skipBefore ); + + static bool ShouldExit(); + +#ifdef TRACY_ON_DEMAND + tracy_force_inline bool IsConnected() const + { + return m_isConnected.load( std::memory_order_acquire ); + } + + tracy_force_inline uint64_t ConnectionId() const + { + return m_connectionId.load( std::memory_order_acquire ); + } + + tracy_force_inline void DeferItem( const QueueItem& item ) + { + m_deferredLock.lock(); + auto dst = m_deferredQueue.push_next(); + memcpy( dst, &item, sizeof( item ) ); + m_deferredLock.unlock(); + } +#endif + + void RequestShutdown() { m_shutdown.store( true, std::memory_order_relaxed ); m_shutdownManual.store( true, std::memory_order_relaxed ); } + bool HasShutdownFinished() const { return m_shutdownFinished.load( std::memory_order_relaxed ); } + + void SendString( uint64_t ptr, const char* str, QueueType type ); + +private: + enum class DequeueStatus { DataDequeued, ConnectionLost, QueueEmpty }; + + static void LaunchWorker( void* ptr ) { ((Profiler*)ptr)->Worker(); } + void Worker(); + + static void LaunchCompressWorker( void* ptr ) { ((Profiler*)ptr)->CompressWorker(); } + void CompressWorker(); + + void ClearQueues( tracy::moodycamel::ConsumerToken& token ); + void ClearSerial(); + DequeueStatus Dequeue( tracy::moodycamel::ConsumerToken& token ); + DequeueStatus DequeueContextSwitches( tracy::moodycamel::ConsumerToken& token, int64_t& timeStop ); + DequeueStatus DequeueSerial(); + bool AppendData( const void* data, size_t len ); + bool CommitData(); + bool NeedDataSize( size_t len ); + + tracy_force_inline void AppendDataUnsafe( const void* data, size_t len ) + { + memcpy( m_buffer + m_bufferOffset, data, len ); + m_bufferOffset += int( len ); + } + + bool SendData( const char* data, size_t len ); + void SendLongString( uint64_t ptr, const char* str, size_t len, QueueType type ); + void SendSourceLocation( uint64_t ptr ); + void SendSourceLocationPayload( uint64_t ptr ); + void SendCallstackPayload( uint64_t ptr ); + void SendCallstackAlloc( uint64_t ptr ); + void SendCallstackFrame( uint64_t ptr ); + + bool HandleServerQuery(); + void HandleDisconnect(); + void HandleParameter( uint64_t payload ); + + void CalibrateTimer(); + void CalibrateDelay(); + + static tracy_force_inline void SendCallstackMemory( void* ptr ) + { +#ifdef TRACY_HAS_CALLSTACK + auto item = GetProfiler().m_serialQueue.prepare_next(); + MemWrite( &item->hdr.type, QueueType::CallstackMemory ); + MemWrite( &item->callstackMemory.ptr, (uint64_t)ptr ); + GetProfiler().m_serialQueue.commit_next(); +#endif + } + + static tracy_force_inline void SendMemAlloc( QueueType type, const uint64_t thread, const void* ptr, size_t size ) + { + assert( type == QueueType::MemAlloc || type == QueueType::MemAllocCallstack ); + + auto item = GetProfiler().m_serialQueue.prepare_next(); + MemWrite( &item->hdr.type, type ); + MemWrite( &item->memAlloc.time, GetTime() ); + MemWrite( &item->memAlloc.thread, thread ); + MemWrite( &item->memAlloc.ptr, (uint64_t)ptr ); + if( compile_time_condition<sizeof( size ) == 4>::value ) + { + memcpy( &item->memAlloc.size, &size, 4 ); + memset( &item->memAlloc.size + 4, 0, 2 ); + } + else + { + assert( sizeof( size ) == 8 ); + memcpy( &item->memAlloc.size, &size, 6 ); + } + GetProfiler().m_serialQueue.commit_next(); + } + + static tracy_force_inline void SendMemFree( QueueType type, const uint64_t thread, const void* ptr ) + { + assert( type == QueueType::MemFree || type == QueueType::MemFreeCallstack ); + + auto item = 
GetProfiler().m_serialQueue.prepare_next(); + MemWrite( &item->hdr.type, type ); + MemWrite( &item->memFree.time, GetTime() ); + MemWrite( &item->memFree.thread, thread ); + MemWrite( &item->memFree.ptr, (uint64_t)ptr ); + GetProfiler().m_serialQueue.commit_next(); + } + + double m_timerMul; + uint64_t m_resolution; + uint64_t m_delay; + std::atomic<int64_t> m_timeBegin; + uint64_t m_mainThread; + uint64_t m_epoch; + std::atomic<bool> m_shutdown; + std::atomic<bool> m_shutdownManual; + std::atomic<bool> m_shutdownFinished; + Socket* m_sock; + UdpBroadcast* m_broadcast; + bool m_noExit; + std::atomic<uint32_t> m_zoneId; + + uint64_t m_threadCtx; + int64_t m_refTimeThread; + int64_t m_refTimeSerial; + int64_t m_refTimeCtx; + int64_t m_refTimeGpu; + + void* m_stream; // LZ4_stream_t* + char* m_buffer; + int m_bufferOffset; + int m_bufferStart; + + QueueItem* m_itemBuf; + char* m_lz4Buf; + + FastVector<QueueItem> m_serialQueue, m_serialDequeue; + TracyMutex m_serialLock; + + FastVector<FrameImageQueueItem> m_fiQueue, m_fiDequeue; + TracyMutex m_fiLock; + + std::atomic<uint64_t> m_frameCount; +#ifdef TRACY_ON_DEMAND + std::atomic<bool> m_isConnected; + std::atomic<uint64_t> m_connectionId; + + TracyMutex m_deferredLock; + FastVector<QueueItem> m_deferredQueue; +#endif + +#ifdef TRACY_HAS_SYSTIME + void ProcessSysTime(); + + SysTime m_sysTime; + uint64_t m_sysTimeLast = 0; +#else + void ProcessSysTime() {} +#endif + + ParameterCallback m_paramCallback; +}; + +}; + +#endif diff --git a/libs/tracy/client/TracyScoped.hpp b/libs/tracy/client/TracyScoped.hpp @@ -0,0 +1,119 @@ +#ifndef __TRACYSCOPED_HPP__ +#define __TRACYSCOPED_HPP__ + +#include <stdint.h> +#include <string.h> + +#include "../common/TracySystem.hpp" +#include "../common/TracyAlign.hpp" +#include "../common/TracyAlloc.hpp" +#include "TracyProfiler.hpp" + +namespace tracy +{ + +class ScopedZone +{ +public: + tracy_force_inline ScopedZone( const SourceLocationData* srcloc, bool is_active = true ) +#ifdef TRACY_ON_DEMAND + : m_active( is_active && GetProfiler().IsConnected() ) + , m_connectionId( GetProfiler().ConnectionId() ) +#else + : m_active( is_active ) +#endif + { + if( !m_active ) return; + Magic magic; + auto token = GetToken(); + auto& tail = token->get_tail_index(); + auto item = token->enqueue_begin( magic ); + MemWrite( &item->hdr.type, QueueType::ZoneBegin ); + MemWrite( &item->zoneBegin.time, Profiler::GetTime() ); + MemWrite( &item->zoneBegin.srcloc, (uint64_t)srcloc ); + tail.store( magic + 1, std::memory_order_release ); + } + + tracy_force_inline ScopedZone( const SourceLocationData* srcloc, int depth, bool is_active = true ) +#ifdef TRACY_ON_DEMAND + : m_active( is_active && GetProfiler().IsConnected() ) + , m_connectionId( GetProfiler().ConnectionId() ) +#else + : m_active( is_active ) +#endif + { + if( !m_active ) return; + Magic magic; + auto token = GetToken(); + auto& tail = token->get_tail_index(); + auto item = token->enqueue_begin( magic ); + MemWrite( &item->hdr.type, QueueType::ZoneBeginCallstack ); + MemWrite( &item->zoneBegin.time, Profiler::GetTime() ); + MemWrite( &item->zoneBegin.srcloc, (uint64_t)srcloc ); + tail.store( magic + 1, std::memory_order_release ); + + GetProfiler().SendCallstack( depth ); + } + + tracy_force_inline ~ScopedZone() + { + if( !m_active ) return; +#ifdef TRACY_ON_DEMAND + if( GetProfiler().ConnectionId() != m_connectionId ) return; +#endif + Magic magic; + auto token = GetToken(); + auto& tail = token->get_tail_index(); + auto item = token->enqueue_begin( magic ); + MemWrite( 
&item->hdr.type, QueueType::ZoneEnd ); + MemWrite( &item->zoneEnd.time, Profiler::GetTime() ); + tail.store( magic + 1, std::memory_order_release ); + } + + tracy_force_inline void Text( const char* txt, size_t size ) + { + if( !m_active ) return; +#ifdef TRACY_ON_DEMAND + if( GetProfiler().ConnectionId() != m_connectionId ) return; +#endif + Magic magic; + auto token = GetToken(); + auto ptr = (char*)tracy_malloc( size+1 ); + memcpy( ptr, txt, size ); + ptr[size] = '\0'; + auto& tail = token->get_tail_index(); + auto item = token->enqueue_begin( magic ); + MemWrite( &item->hdr.type, QueueType::ZoneText ); + MemWrite( &item->zoneText.text, (uint64_t)ptr ); + tail.store( magic + 1, std::memory_order_release ); + } + + tracy_force_inline void Name( const char* txt, size_t size ) + { + if( !m_active ) return; +#ifdef TRACY_ON_DEMAND + if( GetProfiler().ConnectionId() != m_connectionId ) return; +#endif + Magic magic; + auto token = GetToken(); + auto ptr = (char*)tracy_malloc( size+1 ); + memcpy( ptr, txt, size ); + ptr[size] = '\0'; + auto& tail = token->get_tail_index(); + auto item = token->enqueue_begin( magic ); + MemWrite( &item->hdr.type, QueueType::ZoneName ); + MemWrite( &item->zoneText.text, (uint64_t)ptr ); + tail.store( magic + 1, std::memory_order_release ); + } + +private: + const bool m_active; + +#ifdef TRACY_ON_DEMAND + uint64_t m_connectionId; +#endif +}; + +} + +#endif diff --git a/libs/tracy/client/TracySysTime.cpp b/libs/tracy/client/TracySysTime.cpp @@ -0,0 +1,105 @@ +#include "TracySysTime.hpp" + +#ifdef TRACY_HAS_SYSTIME + +# if defined _WIN32 || defined __CYGWIN__ +# include <windows.h> +# elif defined __linux__ +# include <stdio.h> +# include <inttypes.h> +# elif defined __APPLE__ +# include <mach/mach_host.h> +# include <mach/host_info.h> +# elif defined BSD +# include <sys/types.h> +# include <sys/sysctl.h> +# endif + +namespace tracy +{ + +# if defined _WIN32 || defined __CYGWIN__ + +static inline uint64_t ConvertTime( const FILETIME& t ) +{ + return ( uint64_t( t.dwHighDateTime ) << 32 ) | uint64_t( t.dwLowDateTime ); +} + +void SysTime::ReadTimes() +{ + FILETIME idleTime; + FILETIME kernelTime; + FILETIME userTime; + + GetSystemTimes( &idleTime, &kernelTime, &userTime ); + + idle = ConvertTime( idleTime ); + const auto kernel = ConvertTime( kernelTime ); + const auto user = ConvertTime( userTime ); + used = kernel + user; +} + +# elif defined __linux__ + +void SysTime::ReadTimes() +{ + uint64_t user, nice, system; + FILE* f = fopen( "/proc/stat", "r" ); + if( f ) + { + fscanf( f, "cpu %" PRIu64 " %" PRIu64 " %" PRIu64" %" PRIu64, &user, &nice, &system, &idle ); + fclose( f ); + used = user + nice + system; + } +} + +# elif defined __APPLE__ + +void SysTime::ReadTimes() +{ + host_cpu_load_info_data_t info; + mach_msg_type_number_t cnt = HOST_CPU_LOAD_INFO_COUNT; + host_statistics( mach_host_self(), HOST_CPU_LOAD_INFO, reinterpret_cast<host_info_t>( &info ), &cnt ); + used = info.cpu_ticks[CPU_STATE_USER] + info.cpu_ticks[CPU_STATE_NICE] + info.cpu_ticks[CPU_STATE_SYSTEM]; + idle = info.cpu_ticks[CPU_STATE_IDLE]; +} + +# elif defined BSD + +void SysTime::ReadTimes() +{ + u_long data[5]; + size_t sz = sizeof( data ); + sysctlbyname( "kern.cp_time", &data, &sz, nullptr, 0 ); + used = data[0] + data[1] + data[2] + data[3]; + idle = data[4]; +} + +#endif + +SysTime::SysTime() +{ + ReadTimes(); +} + +float SysTime::Get() +{ + const auto oldUsed = used; + const auto oldIdle = idle; + + ReadTimes(); + + const auto diffIdle = idle - oldIdle; + const auto diffUsed = used - 
oldUsed; + +#if defined _WIN32 || defined __CYGWIN__ + return diffUsed == 0 ? -1 : ( diffUsed - diffIdle ) * 100.f / diffUsed; +#elif defined __linux__ || defined __APPLE__ || defined BSD + const auto total = diffUsed + diffIdle; + return total == 0 ? -1 : diffUsed * 100.f / total; +#endif +} + +} + +#endif diff --git a/libs/tracy/client/TracySysTime.hpp b/libs/tracy/client/TracySysTime.hpp @@ -0,0 +1,36 @@ +#ifndef __TRACYSYSTIME_HPP__ +#define __TRACYSYSTIME_HPP__ + +#if defined _WIN32 || defined __CYGWIN__ || defined __linux__ || defined __APPLE__ +# define TRACY_HAS_SYSTIME +#else +# include <sys/param.h> +#endif + +#ifdef BSD +# define TRACY_HAS_SYSTIME +#endif + +#ifdef TRACY_HAS_SYSTIME + +#include <stdint.h> + +namespace tracy +{ + +class SysTime +{ +public: + SysTime(); + float Get(); + + void ReadTimes(); + +private: + uint64_t idle, used; +}; + +} +#endif + +#endif diff --git a/libs/tracy/client/TracySysTrace.cpp b/libs/tracy/client/TracySysTrace.cpp @@ -0,0 +1,862 @@ +#include "TracySysTrace.hpp" + +#ifdef TRACY_HAS_SYSTEM_TRACING + +# if defined _WIN32 || defined __CYGWIN__ + +# ifndef NOMINMAX +# define NOMINMAX +# endif + +# define INITGUID +# include <assert.h> +# include <string.h> +# include <windows.h> +# include <dbghelp.h> +# include <evntrace.h> +# include <evntcons.h> +# include <psapi.h> +# include <winternl.h> + +# include "../common/TracyAlloc.hpp" +# include "../common/TracySystem.hpp" +# include "TracyProfiler.hpp" + +namespace tracy +{ + +TRACEHANDLE s_traceHandle; +TRACEHANDLE s_traceHandle2; +EVENT_TRACE_PROPERTIES* s_prop; + +struct CSwitch +{ + uint32_t newThreadId; + uint32_t oldThreadId; + int8_t newThreadPriority; + int8_t oldThreadPriority; + uint8_t previousCState; + int8_t spareByte; + int8_t oldThreadWaitReason; + int8_t oldThreadWaitMode; + int8_t oldThreadState; + int8_t oldThreadWaitIdealProcessor; + uint32_t newThreadWaitTime; + uint32_t reserved; +}; + +struct ReadyThread +{ + uint32_t threadId; + int8_t adjustReason; + int8_t adjustIncrement; + int8_t flag; + int8_t reserverd; +}; + +void WINAPI EventRecordCallback( PEVENT_RECORD record ) +{ +#ifdef TRACY_ON_DEMAND + if( !GetProfiler().IsConnected() ) return; +#endif + + const auto& hdr = record->EventHeader; + if( hdr.EventDescriptor.Opcode == 36 ) + { + const auto cswitch = (const CSwitch*)record->UserData; + + Magic magic; + auto token = GetToken(); + auto& tail = token->get_tail_index(); + auto item = token->enqueue_begin( magic ); + MemWrite( &item->hdr.type, QueueType::ContextSwitch ); + MemWrite( &item->contextSwitch.time, hdr.TimeStamp.QuadPart ); + memcpy( &item->contextSwitch.oldThread, &cswitch->oldThreadId, sizeof( cswitch->oldThreadId ) ); + memcpy( &item->contextSwitch.newThread, &cswitch->newThreadId, sizeof( cswitch->newThreadId ) ); + memset( ((char*)&item->contextSwitch.oldThread)+4, 0, 4 ); + memset( ((char*)&item->contextSwitch.newThread)+4, 0, 4 ); + MemWrite( &item->contextSwitch.cpu, record->BufferContext.ProcessorNumber ); + MemWrite( &item->contextSwitch.reason, cswitch->oldThreadWaitReason ); + MemWrite( &item->contextSwitch.state, cswitch->oldThreadState ); + tail.store( magic + 1, std::memory_order_release ); + } + else if( hdr.EventDescriptor.Opcode == 50 ) + { + const auto rt = (const ReadyThread*)record->UserData; + + Magic magic; + auto token = GetToken(); + auto& tail = token->get_tail_index(); + auto item = token->enqueue_begin( magic ); + MemWrite( &item->hdr.type, QueueType::ThreadWakeup ); + MemWrite( &item->threadWakeup.time, hdr.TimeStamp.QuadPart ); + 
memcpy( &item->threadWakeup.thread, &rt->threadId, sizeof( rt->threadId ) ); + memset( ((char*)&item->threadWakeup.thread)+4, 0, 4 ); + tail.store( magic + 1, std::memory_order_release ); + } +} + +bool SysTraceStart() +{ + TOKEN_PRIVILEGES priv = {}; + priv.PrivilegeCount = 1; + priv.Privileges[0].Attributes = SE_PRIVILEGE_ENABLED; + if( LookupPrivilegeValue( nullptr, SE_SYSTEM_PROFILE_NAME, &priv.Privileges[0].Luid ) == 0 ) return false; + + HANDLE pt; + if( OpenProcessToken( GetCurrentProcess(), TOKEN_ADJUST_PRIVILEGES, &pt ) == 0 ) return false; + const auto adjust = AdjustTokenPrivileges( pt, FALSE, &priv, 0, nullptr, nullptr ); + CloseHandle( pt ); + if( adjust == 0 ) return false; + const auto status = GetLastError(); + if( status != ERROR_SUCCESS ) return false; + + const auto psz = sizeof( EVENT_TRACE_PROPERTIES ) + sizeof( KERNEL_LOGGER_NAME ); + s_prop = (EVENT_TRACE_PROPERTIES*)tracy_malloc( psz ); + memset( s_prop, 0, sizeof( EVENT_TRACE_PROPERTIES ) ); + s_prop->EnableFlags = EVENT_TRACE_FLAG_CSWITCH | EVENT_TRACE_FLAG_DISPATCHER; + s_prop->LogFileMode = EVENT_TRACE_REAL_TIME_MODE; + s_prop->Wnode.BufferSize = psz; + s_prop->Wnode.Flags = WNODE_FLAG_TRACED_GUID; + s_prop->Wnode.ClientContext = 3; + s_prop->Wnode.Guid = SystemTraceControlGuid; + s_prop->LoggerNameOffset = sizeof( EVENT_TRACE_PROPERTIES ); + memcpy( ((char*)s_prop) + sizeof( EVENT_TRACE_PROPERTIES ), KERNEL_LOGGER_NAME, sizeof( KERNEL_LOGGER_NAME ) ); + + auto backup = tracy_malloc( psz ); + memcpy( backup, s_prop, psz ); + + const auto controlStatus = ControlTrace( 0, KERNEL_LOGGER_NAME, s_prop, EVENT_TRACE_CONTROL_STOP ); + if( controlStatus != ERROR_SUCCESS && controlStatus != ERROR_WMI_INSTANCE_NOT_FOUND ) + { + tracy_free( s_prop ); + return false; + } + + memcpy( s_prop, backup, psz ); + tracy_free( backup ); + + const auto startStatus = StartTrace( &s_traceHandle, KERNEL_LOGGER_NAME, s_prop ); + if( startStatus != ERROR_SUCCESS ) + { + tracy_free( s_prop ); + return false; + } + +#ifdef UNICODE + WCHAR KernelLoggerName[sizeof( KERNEL_LOGGER_NAME )]; +#else + char KernelLoggerName[sizeof( KERNEL_LOGGER_NAME )]; +#endif + memcpy( KernelLoggerName, KERNEL_LOGGER_NAME, sizeof( KERNEL_LOGGER_NAME ) ); + EVENT_TRACE_LOGFILE log = {}; + log.LoggerName = KernelLoggerName; + log.ProcessTraceMode = PROCESS_TRACE_MODE_REAL_TIME | PROCESS_TRACE_MODE_EVENT_RECORD | PROCESS_TRACE_MODE_RAW_TIMESTAMP; + log.EventRecordCallback = EventRecordCallback; + + s_traceHandle2 = OpenTrace( &log ); + if( s_traceHandle2 == (TRACEHANDLE)INVALID_HANDLE_VALUE ) + { + CloseTrace( s_traceHandle ); + tracy_free( s_prop ); + return false; + } + + return true; +} + +void SysTraceStop() +{ + CloseTrace( s_traceHandle2 ); + CloseTrace( s_traceHandle ); +} + +void SysTraceWorker( void* ptr ) +{ + SetThreadName( "Tracy SysTrace" ); + ProcessTrace( &s_traceHandle2, 1, 0, 0 ); + ControlTrace( 0, KERNEL_LOGGER_NAME, s_prop, EVENT_TRACE_CONTROL_STOP ); + tracy_free( s_prop ); +} + +#ifdef __CYGWIN__ +extern "C" typedef DWORD (WINAPI *t_GetProcessIdOfThread)( HANDLE ); +extern "C" typedef DWORD (WINAPI *t_GetProcessImageFileNameA)( HANDLE, LPSTR, DWORD ); +# ifdef UNICODE +t_GetProcessIdOfThread GetProcessIdOfThread = (t_GetProcessIdOfThread)GetProcAddress( GetModuleHandle( L"kernel32.dll" ), "GetProcessIdOfThread" ); +t_GetProcessImageFileNameA GetProcessImageFileNameA = (t_GetProcessImageFileNameA)GetProcAddress( GetModuleHandle( L"kernel32.dll" ), "K32GetProcessImageFileNameA" ); +# else +t_GetProcessIdOfThread GetProcessIdOfThread = 
(t_GetProcessIdOfThread)GetProcAddress( GetModuleHandle( "kernel32.dll" ), "GetProcessIdOfThread" ); +t_GetProcessImageFileNameA GetProcessImageFileNameA = (t_GetProcessImageFileNameA)GetProcAddress( GetModuleHandle( "kernel32.dll" ), "K32GetProcessImageFileNameA" ); +# endif +#endif + +extern "C" typedef NTSTATUS (WINAPI *t_NtQueryInformationThread)( HANDLE, THREADINFOCLASS, PVOID, ULONG, PULONG ); +extern "C" typedef BOOL (WINAPI *t_EnumProcessModules)( HANDLE, HMODULE*, DWORD, LPDWORD ); +extern "C" typedef BOOL (WINAPI *t_GetModuleInformation)( HANDLE, HMODULE, LPMODULEINFO, DWORD ); +extern "C" typedef DWORD (WINAPI *t_GetModuleBaseNameA)( HANDLE, HMODULE, LPSTR, DWORD ); +#ifdef UNICODE +t_NtQueryInformationThread NtQueryInformationThread = (t_NtQueryInformationThread)GetProcAddress( GetModuleHandle( L"ntdll.dll" ), "NtQueryInformationThread" ); +t_EnumProcessModules _EnumProcessModules = (t_EnumProcessModules)GetProcAddress( GetModuleHandle( L"kernel32.dll" ), "K32EnumProcessModules" ); +t_GetModuleInformation _GetModuleInformation = (t_GetModuleInformation)GetProcAddress( GetModuleHandle( L"kernel32.dll" ), "K32GetModuleInformation" ); +t_GetModuleBaseNameA _GetModuleBaseNameA = (t_GetModuleBaseNameA)GetProcAddress( GetModuleHandle( L"kernel32.dll" ), "K32GetModuleBaseNameA" ); +#else +t_NtQueryInformationThread NtQueryInformationThread = (t_NtQueryInformationThread)GetProcAddress( GetModuleHandle( "ntdll.dll" ), "NtQueryInformationThread" ); +t_EnumProcessModules _EnumProcessModules = (t_EnumProcessModules)GetProcAddress( GetModuleHandle( "kernel32.dll" ), "K32EnumProcessModules" ); +t_GetModuleInformation _GetModuleInformation = (t_GetModuleInformation)GetProcAddress( GetModuleHandle( "kernel32.dll" ), "K32GetModuleInformation" ); +t_GetModuleBaseNameA _GetModuleBaseNameA = (t_GetModuleBaseNameA)GetProcAddress( GetModuleHandle( "kernel32.dll" ), "K32GetModuleBaseNameA" ); +#endif + + +void SysTraceSendExternalName( uint64_t thread ) +{ + bool threadSent = false; + auto hnd = OpenThread( THREAD_QUERY_INFORMATION, FALSE, DWORD( thread ) ); + if( hnd == 0 ) + { + hnd = OpenThread( THREAD_QUERY_LIMITED_INFORMATION, FALSE, DWORD( thread ) ); + } + if( hnd != 0 ) + { +#if defined NTDDI_WIN10_RS2 && NTDDI_VERSION >= NTDDI_WIN10_RS2 + PWSTR tmp; + GetThreadDescription( hnd, &tmp ); + char buf[256]; + if( tmp ) + { + auto ret = wcstombs( buf, tmp, 256 ); + if( ret != 0 ) + { + GetProfiler().SendString( thread, buf, QueueType::ExternalThreadName ); + threadSent = true; + } + } +#endif + const auto pid = GetProcessIdOfThread( hnd ); + if( !threadSent && NtQueryInformationThread && _EnumProcessModules && _GetModuleInformation && _GetModuleBaseNameA ) + { + void* ptr; + ULONG retlen; + auto status = NtQueryInformationThread( hnd, (THREADINFOCLASS)9 /*ThreadQuerySetWin32StartAddress*/, &ptr, sizeof( &ptr ), &retlen ); + if( status == 0 ) + { + const auto phnd = OpenProcess( PROCESS_QUERY_INFORMATION | PROCESS_VM_READ, FALSE, pid ); + if( phnd != INVALID_HANDLE_VALUE ) + { + HMODULE modules[1024]; + DWORD needed; + if( _EnumProcessModules( phnd, modules, 1024 * sizeof( HMODULE ), &needed ) != 0 ) + { + const auto sz = std::min( DWORD( needed / sizeof( HMODULE ) ), DWORD( 1024 ) ); + for( DWORD i=0; i<sz; i++ ) + { + MODULEINFO info; + if( _GetModuleInformation( phnd, modules[i], &info, sizeof( info ) ) != 0 ) + { + if( (uint64_t)ptr >= (uint64_t)info.lpBaseOfDll && (uint64_t)ptr <= (uint64_t)info.lpBaseOfDll + (uint64_t)info.SizeOfImage ) + { + char buf[1024]; + if( _GetModuleBaseNameA( phnd, 
modules[i], buf, 1024 ) != 0 ) + { + GetProfiler().SendString( thread, buf, QueueType::ExternalThreadName ); + threadSent = true; + } + } + } + } + } + CloseHandle( phnd ); + } + } + } + CloseHandle( hnd ); + if( !threadSent ) + { + GetProfiler().SendString( thread, "???", QueueType::ExternalThreadName ); + threadSent = true; + } + if( pid != 0 ) + { + { + uint64_t _pid = pid; + Magic magic; + auto token = GetToken(); + auto& tail = token->get_tail_index(); + auto item = token->enqueue_begin( magic ); + MemWrite( &item->hdr.type, QueueType::TidToPid ); + MemWrite( &item->tidToPid.tid, thread ); + MemWrite( &item->tidToPid.pid, _pid ); + tail.store( magic + 1, std::memory_order_release ); + } + if( pid == 4 ) + { + GetProfiler().SendString( thread, "System", QueueType::ExternalName ); + return; + } + else + { + const auto phnd = OpenProcess( PROCESS_QUERY_LIMITED_INFORMATION, FALSE, pid ); + if( phnd != INVALID_HANDLE_VALUE ) + { + char buf[1024]; + const auto sz = GetProcessImageFileNameA( phnd, buf, 1024 ); + CloseHandle( phnd ); + if( sz != 0 ) + { + auto ptr = buf + sz - 1; + while( ptr > buf && *ptr != '\\' ) ptr--; + if( *ptr == '\\' ) ptr++; + GetProfiler().SendString( thread, ptr, QueueType::ExternalName ); + return; + } + } + } + } + } + + if( !threadSent ) + { + GetProfiler().SendString( thread, "???", QueueType::ExternalThreadName ); + } + GetProfiler().SendString( thread, "???", QueueType::ExternalName ); +} + +} + +# elif defined __linux__ + +# include <sys/types.h> +# include <sys/stat.h> +# include <sys/wait.h> +# include <fcntl.h> +# include <inttypes.h> +# include <limits> +# include <poll.h> +# include <stdio.h> +# include <stdlib.h> +# include <string.h> +# include <unistd.h> + +# include "TracyProfiler.hpp" + +# ifdef __ANDROID__ +# include "TracySysTracePayload.hpp" +# endif + +namespace tracy +{ + +static const char BasePath[] = "/sys/kernel/debug/tracing/"; +static const char TracingOn[] = "tracing_on"; +static const char CurrentTracer[] = "current_tracer"; +static const char TraceOptions[] = "trace_options"; +static const char TraceClock[] = "trace_clock"; +static const char SchedSwitch[] = "events/sched/sched_switch/enable"; +static const char SchedWakeup[] = "events/sched/sched_wakeup/enable"; +static const char BufferSizeKb[] = "buffer_size_kb"; +static const char TracePipe[] = "trace_pipe"; + +#ifdef __ANDROID__ +static bool TraceWrite( const char* path, size_t psz, const char* val, size_t vsz ) +{ + char tmp[256]; + sprintf( tmp, "su -c 'echo \"%s\" > %s%s'", val, BasePath, path ); + return system( tmp ) == 0; +} +#else +static bool TraceWrite( const char* path, size_t psz, const char* val, size_t vsz ) +{ + char tmp[256]; + memcpy( tmp, BasePath, sizeof( BasePath ) - 1 ); + memcpy( tmp + sizeof( BasePath ) - 1, path, psz ); + + int fd = open( tmp, O_WRONLY ); + if( fd < 0 ) return false; + + for(;;) + { + ssize_t cnt = write( fd, val, vsz ); + if( cnt == (ssize_t)vsz ) + { + close( fd ); + return true; + } + if( cnt < 0 ) + { + close( fd ); + return false; + } + vsz -= cnt; + val += cnt; + } +} +#endif + +#ifdef __ANDROID__ +void SysTraceInjectPayload() +{ + int pipefd[2]; + if( pipe( pipefd ) == 0 ) + { + const auto pid = fork(); + if( pid == 0 ) + { + // child + close( pipefd[1] ); + if( dup2( pipefd[0], STDIN_FILENO ) >= 0 ) + { + close( pipefd[0] ); + execlp( "su", "su", "-c", "cat > /data/tracy_systrace", (char*)nullptr ); + exit( 1 ); + } + } + else if( pid > 0 ) + { + // parent + close( pipefd[0] ); + +#ifdef __aarch64__ + write( pipefd[1], 
tracy_systrace_aarch64_data, tracy_systrace_aarch64_size ); +#else + write( pipefd[1], tracy_systrace_armv7_data, tracy_systrace_armv7_size ); +#endif + close( pipefd[1] ); + waitpid( pid, nullptr, 0 ); + + system( "su -c 'chmod 700 /data/tracy_systrace'" ); + } + } +} +#endif + +bool SysTraceStart() +{ + if( !TraceWrite( TracingOn, sizeof( TracingOn ), "0", 2 ) ) return false; + if( !TraceWrite( CurrentTracer, sizeof( CurrentTracer ), "nop", 4 ) ) return false; + TraceWrite( TraceOptions, sizeof( TraceOptions ), "norecord-cmd", 13 ); + TraceWrite( TraceOptions, sizeof( TraceOptions ), "norecord-tgid", 14 ); + TraceWrite( TraceOptions, sizeof( TraceOptions ), "noirq-info", 11 ); +#if defined TRACY_HW_TIMER && ( defined __i386 || defined _M_IX86 || defined __x86_64__ || defined _M_X64 ) + if( !TraceWrite( TraceClock, sizeof( TraceClock ), "x86-tsc", 8 ) ) return false; +#elif __ARM_ARCH >= 6 + if( !TraceWrite( TraceClock, sizeof( TraceClock ), "mono_raw", 9 ) ) return false; +#endif + if( !TraceWrite( SchedSwitch, sizeof( SchedSwitch ), "1", 2 ) ) return false; + if( !TraceWrite( SchedWakeup, sizeof( SchedWakeup ), "1", 2 ) ) return false; + if( !TraceWrite( BufferSizeKb, sizeof( BufferSizeKb ), "512", 4 ) ) return false; + +#if defined __ANDROID__ && ( defined __aarch64__ || defined __ARM_ARCH ) + SysTraceInjectPayload(); +#endif + + if( !TraceWrite( TracingOn, sizeof( TracingOn ), "1", 2 ) ) return false; + + return true; +} + +void SysTraceStop() +{ + TraceWrite( TracingOn, sizeof( TracingOn ), "0", 2 ); +} + +static uint64_t ReadNumber( const char*& ptr ) +{ + uint64_t val = 0; + for(;;) + { + if( *ptr >= '0' && *ptr <= '9' ) + { + val = val * 10 + ( *ptr - '0' ); + ptr++; + } + else + { + return val; + } + } +} + +static uint8_t ReadState( char state ) +{ + switch( state ) + { + case 'D': return 101; + case 'I': return 102; + case 'R': return 103; + case 'S': return 104; + case 'T': return 105; + case 't': return 106; + case 'W': return 107; + case 'X': return 108; + case 'Z': return 109; + default: return 100; + } +} + +#if defined __ANDROID__ && defined __ANDROID_API__ && __ANDROID_API__ < 18 +/*- + * Copyright (c) 2011 The NetBSD Foundation, Inc. + * All rights reserved. + * + * This code is derived from software contributed to The NetBSD Foundation + * by Christos Zoulas. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +ssize_t getdelim(char **buf, size_t *bufsiz, int delimiter, FILE *fp) +{ + char *ptr, *eptr; + + if (*buf == NULL || *bufsiz == 0) { + *bufsiz = BUFSIZ; + if ((*buf = (char*)malloc(*bufsiz)) == NULL) + return -1; + } + + for (ptr = *buf, eptr = *buf + *bufsiz;;) { + int c = fgetc(fp); + if (c == -1) { + if (feof(fp)) + return ptr == *buf ? -1 : ptr - *buf; + else + return -1; + } + *ptr++ = c; + if (c == delimiter) { + *ptr = '\0'; + return ptr - *buf; + } + if (ptr + 2 >= eptr) { + char *nbuf; + size_t nbufsiz = *bufsiz * 2; + ssize_t d = ptr - *buf; + if ((nbuf = (char*)realloc(*buf, nbufsiz)) == NULL) + return -1; + *buf = nbuf; + *bufsiz = nbufsiz; + eptr = nbuf + nbufsiz; + ptr = nbuf + d; + } + } +} + +ssize_t getline(char **buf, size_t *bufsiz, FILE *fp) +{ + return getdelim(buf, bufsiz, '\n', fp); +} +#endif + +static void HandleTraceLine( const char* line ) +{ + line += 24; + const auto cpu = (uint8_t)ReadNumber( line ); + + line++; // ']' + while( *line == ' ' ) line++; + +#if defined TRACY_HW_TIMER && ( defined __i386 || defined _M_IX86 || defined __x86_64__ || defined _M_X64 ) + const auto time = ReadNumber( line ); +#elif __ARM_ARCH >= 6 + const auto ts = ReadNumber( line ); + line++; // '.' + const auto tus = ReadNumber( line ); + const auto time = ts * 1000000000ll + tus * 1000ll; +#endif + + line += 2; // ': ' + if( memcmp( line, "sched_switch", 12 ) == 0 ) + { + line += 14; + + while( memcmp( line, "prev_pid", 8 ) != 0 ) line++; + line += 9; + + const auto oldPid = ReadNumber( line ); + line++; + + while( memcmp( line, "prev_state", 10 ) != 0 ) line++; + line += 11; + + const auto oldState = (uint8_t)ReadState( *line ); + line += 5; + + while( memcmp( line, "next_pid", 8 ) != 0 ) line++; + line += 9; + + const auto newPid = ReadNumber( line ); + + uint8_t reason = 100; + + Magic magic; + auto token = GetToken(); + auto& tail = token->get_tail_index(); + auto item = token->enqueue_begin( magic ); + MemWrite( &item->hdr.type, QueueType::ContextSwitch ); + MemWrite( &item->contextSwitch.time, time ); + MemWrite( &item->contextSwitch.oldThread, oldPid ); + MemWrite( &item->contextSwitch.newThread, newPid ); + MemWrite( &item->contextSwitch.cpu, cpu ); + MemWrite( &item->contextSwitch.reason, reason ); + MemWrite( &item->contextSwitch.state, oldState ); + tail.store( magic + 1, std::memory_order_release ); + } + else if( memcmp( line, "sched_wakeup", 12 ) == 0 ) + { + line += 14; + + while( memcmp( line, "pid", 3 ) != 0 ) line++; + line += 4; + + const auto pid = ReadNumber( line ); + + Magic magic; + auto token = GetToken(); + auto& tail = token->get_tail_index(); + auto item = token->enqueue_begin( magic ); + MemWrite( &item->hdr.type, QueueType::ThreadWakeup ); + MemWrite( &item->threadWakeup.time, time ); + MemWrite( &item->threadWakeup.thread, pid ); + tail.store( magic + 1, std::memory_order_release ); + } +} + +#ifdef __ANDROID__ +static void ProcessTraceLines( int fd ) +{ + // Linux pipe buffer is 64KB, additional 1KB is for unfinished 
lines + char* buf = (char*)tracy_malloc( (64+1)*1024 ); + char* line = buf; + + for(;;) + { + const auto rd = read( fd, line, 64*1024 ); + if( rd <= 0 ) break; + +#ifdef TRACY_ON_DEMAND + if( !GetProfiler().IsConnected() ) + { + if( rd < 64*1024 ) + { + assert( line[rd-1] == '\n' ); + line = buf; + std::this_thread::sleep_for( std::chrono::milliseconds( 10 ) ); + } + else + { + const auto end = line + rd; + line = end - 1; + while( line > buf && *line != '\n' ) line--; + if( line > buf ) + { + line++; + const auto lsz = end - line; + memmove( buf, line, lsz ); + line = buf + lsz; + } + } + continue; + } +#endif + + const auto end = line + rd; + line = buf; + for(;;) + { + auto next = line; + while( next < end && *next != '\n' ) next++; + next++; + if( next >= end ) + { + const auto lsz = end - line; + memmove( buf, line, lsz ); + line = buf + lsz; + break; + } + + HandleTraceLine( line ); + line = next; + } + if( rd < 64*1024 ) + { + std::this_thread::sleep_for( std::chrono::milliseconds( 10 ) ); + } + } + + tracy_free( buf ); +} + +void SysTraceWorker( void* ptr ) +{ + SetThreadName( "Tracy SysTrace" ); + int pipefd[2]; + if( pipe( pipefd ) == 0 ) + { + const auto pid = fork(); + if( pid == 0 ) + { + // child + close( pipefd[0] ); + dup2( pipefd[1], STDERR_FILENO ); + if( dup2( pipefd[1], STDOUT_FILENO ) >= 0 ) + { + close( pipefd[1] ); +#if defined __ANDROID__ && ( defined __aarch64__ || defined __ARM_ARCH ) + execlp( "su", "su", "-c", "/data/tracy_systrace", (char*)nullptr ); +#endif + execlp( "su", "su", "-c", "cat /sys/kernel/debug/tracing/trace_pipe", (char*)nullptr ); + exit( 1 ); + } + } + else if( pid > 0 ) + { + // parent + close( pipefd[1] ); + ProcessTraceLines( pipefd[0] ); + close( pipefd[0] ); + } + } +} +#else +static void ProcessTraceLines( int fd ) +{ + char* buf = (char*)tracy_malloc( 64*1024 ); + + struct pollfd pfd; + pfd.fd = fd; + pfd.events = POLLIN | POLLERR; + + for(;;) + { + while( poll( &pfd, 1, 0 ) <= 0 ) std::this_thread::sleep_for( std::chrono::milliseconds( 10 ) ); + + const auto rd = read( fd, buf, 64*1024 ); + if( rd <= 0 ) break; + +#ifdef TRACY_ON_DEMAND + if( !GetProfiler().IsConnected() ) continue; +#endif + + auto line = buf; + const auto end = buf + rd; + for(;;) + { + auto next = line; + while( next < end && *next != '\n' ) next++; + if( next == end ) break; + assert( *next == '\n' ); + next++; + + HandleTraceLine( line ); + line = next; + } + } + + tracy_free( buf ); +} + +void SysTraceWorker( void* ptr ) +{ + SetThreadName( "Tracy SysTrace" ); + char tmp[256]; + memcpy( tmp, BasePath, sizeof( BasePath ) - 1 ); + memcpy( tmp + sizeof( BasePath ) - 1, TracePipe, sizeof( TracePipe ) ); + + int fd = open( tmp, O_RDONLY ); + if( fd < 0 ) return; + ProcessTraceLines( fd ); + close( fd ); +} +#endif + +void SysTraceSendExternalName( uint64_t thread ) +{ + FILE* f; + char fn[256]; + sprintf( fn, "/proc/%" PRIu64 "/comm", thread ); + f = fopen( fn, "rb" ); + if( f ) + { + char buf[256]; + const auto sz = fread( buf, 1, 256, f ); + if( sz > 0 && buf[sz-1] == '\n' ) buf[sz-1] = '\0'; + GetProfiler().SendString( thread, buf, QueueType::ExternalThreadName ); + fclose( f ); + } + else + { + GetProfiler().SendString( thread, "???", QueueType::ExternalThreadName ); + } + + sprintf( fn, "/proc/%" PRIu64 "/status", thread ); + f = fopen( fn, "rb" ); + if( f ) + { + int pid = -1; + size_t lsz = 1024; + auto line = (char*)malloc( lsz ); + for(;;) + { + auto rd = getline( &line, &lsz, f ); + if( rd <= 0 ) break; + if( memcmp( "Tgid:\t", line, 6 ) == 0 ) + { + pid = 
atoi( line + 6 ); + break; + } + } + free( line ); + fclose( f ); + if( pid >= 0 ) + { + { + uint64_t _pid = pid; + Magic magic; + auto token = GetToken(); + auto& tail = token->get_tail_index(); + auto item = token->enqueue_begin( magic ); + MemWrite( &item->hdr.type, QueueType::TidToPid ); + MemWrite( &item->tidToPid.tid, thread ); + MemWrite( &item->tidToPid.pid, _pid ); + tail.store( magic + 1, std::memory_order_release ); + } + sprintf( fn, "/proc/%i/comm", pid ); + f = fopen( fn, "rb" ); + if( f ) + { + char buf[256]; + const auto sz = fread( buf, 1, 256, f ); + if( sz > 0 && buf[sz-1] == '\n' ) buf[sz-1] = '\0'; + GetProfiler().SendString( thread, buf, QueueType::ExternalName ); + fclose( f ); + return; + } + } + } + GetProfiler().SendString( thread, "???", QueueType::ExternalName ); +} + +} + +# endif + +#endif diff --git a/libs/tracy/client/TracySysTrace.hpp b/libs/tracy/client/TracySysTrace.hpp @@ -0,0 +1,25 @@ +#ifndef __TRACYSYSTRACE_HPP__ +#define __TRACYSYSTRACE_HPP__ + +#if !defined TRACY_NO_SYSTEM_TRACING && ( defined _WIN32 || defined __CYGWIN__ || defined __linux__ ) +# define TRACY_HAS_SYSTEM_TRACING +#endif + +#ifdef TRACY_HAS_SYSTEM_TRACING + +#include <stdint.h> + +namespace tracy +{ + +bool SysTraceStart(); +void SysTraceStop(); +void SysTraceWorker( void* ptr ); + +void SysTraceSendExternalName( uint64_t thread ); + +} + +#endif + +#endif diff --git a/libs/tracy/client/TracySysTracePayload.hpp b/libs/tracy/client/TracySysTracePayload.hpp @@ -0,0 +1,80 @@ +// File: '/home/wolf/desktop/tracy_systrace.armv7' (1210 bytes)` +// File: '/home/wolf/desktop/tracy_systrace.aarch64' (1650 bytes) + +// Exported using binary_to_compressed_c.cpp + +namespace tracy +{ + +static const unsigned int tracy_systrace_armv7_size = 1210; +static const unsigned int tracy_systrace_armv7_data[1212/4] = +{ + 0x464c457f, 0x00010101, 0x00000000, 0x00000000, 0x00280003, 0x00000001, 0x00000208, 0x00000034, 0x00000000, 0x05000200, 0x00200034, 0x00280007, + 0x00000000, 0x00000006, 0x00000034, 0x00000034, 0x00000034, 0x000000e0, 0x000000e0, 0x00000004, 0x00000004, 0x00000003, 0x00000114, 0x00000114, + 0x00000114, 0x00000013, 0x00000013, 0x00000004, 0x00000001, 0x00000001, 0x00000000, 0x00000000, 0x00000000, 0x000003ed, 0x000003ed, 0x00000005, + 0x00001000, 0x00000001, 0x000003ed, 0x000013ed, 0x000013ed, 0x000000cd, 0x000000cf, 0x00000006, 0x00001000, 0x00000002, 0x000003f0, 0x000013f0, + 0x000013f0, 0x000000b8, 0x000000b8, 0x00000006, 0x00000004, 0x6474e551, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000006, + 0x00000010, 0x70000001, 0x00000394, 0x00000394, 0x00000394, 0x00000008, 0x00000008, 0x00000004, 0x00000004, 0x7379732f, 0x2f6d6574, 0x2f6e6962, + 0x6b6e696c, 0x00007265, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000001, 0x00000000, 0x00000000, 0x00000012, 0x00000016, 0x00000000, + 0x00000000, 0x00000012, 0x6f6c6400, 0x006e6570, 0x4342494c, 0x62696c00, 0x732e6c64, 0x6c64006f, 0x006d7973, 0x00000001, 0x00000003, 0x00000001, + 0x00000000, 0x00000000, 0x00000000, 0x00000001, 0x00000003, 0x00000002, 0x00000000, 0x00000000, 0x00000001, 0x00020000, 0x00000002, 0x00010001, + 0x0000000d, 0x00000010, 0x00000000, 0x00050d63, 0x00020000, 0x00000008, 0x00000000, 0x000014b4, 0x00000116, 0x000014b8, 0x00000216, 0xe52de004, + 0xe59fe004, 0xe08fe00e, 0xe5bef008, 0x000012bc, 0xe28fc600, 0xe28cca01, 0xe5bcf2bc, 0xe28fc600, 0xe28cca01, 0xe5bcf2b4, 0xe92d4ff0, 0xe28db01c, + 0xe24dd01c, 0xe24dd801, 0xe59f0154, 0xe3a01001, 0xe08f0000, 0xebfffff1, 0xe59f1148, 0xe1a07000, 0xe08f1001, 
0xebfffff0, 0xe59f113c, 0xe1a09000, + 0xe1a00007, 0xe08f1001, 0xebffffeb, 0xe59f112c, 0xe1a04000, 0xe1a00007, 0xe08f1001, 0xebffffe6, 0xe59f111c, 0xe1a05000, 0xe1a00007, 0xe08f1001, + 0xebffffe1, 0xe59f110c, 0xe1a06000, 0xe1a00007, 0xe08f1001, 0xebffffdc, 0xe58d0004, 0xe1a00007, 0xe59f10f4, 0xe08f1001, 0xebffffd7, 0xe1a0a000, + 0xe59f00e8, 0xe3a01000, 0xe3a08000, 0xe08f0000, 0xe12fff39, 0xe1a07000, 0xe3700001, 0xca000001, 0xe3a00000, 0xe12fff34, 0xe3a00009, 0xe58d4000, + 0xe1cd01b4, 0xe3090680, 0xe3400098, 0xe28d4010, 0xe58d000c, 0xe28d9018, 0xe58d8008, 0xe28d8008, 0xe58d7010, 0xea000003, 0xe1a02000, 0xe3a00001, + 0xe1a01009, 0xe12fff3a, 0xe1a00004, 0xe3a01001, 0xe3a02000, 0xe12fff35, 0xe3500000, 0xca000008, 0xe1a00008, 0xe3a01000, 0xe12fff36, 0xe1a00004, + 0xe3a01001, 0xe3a02000, 0xe12fff35, 0xe3500001, 0xbafffff6, 0xe59d3004, 0xe1a00007, 0xe1a01009, 0xe3a02801, 0xe12fff33, 0xe3500001, 0xaaffffe5, + 0xe59d1000, 0xe3a00000, 0xe12fff31, 0xe24bd01c, 0xe8bd8ff0, 0x00000174, 0x0000016c, 0x0000015d, 0x0000014e, 0x0000013f, 0x00000135, 0x00000126, + 0x00000114, 0x7ffffe74, 0x00000001, 0x6362696c, 0x006f732e, 0x6e65706f, 0x69786500, 0x6f700074, 0x6e006c6c, 0x736f6e61, 0x7065656c, 0x61657200, + 0x72770064, 0x00657469, 0x7379732f, 0x72656b2f, 0x2f6c656e, 0x75626564, 0x72742f67, 0x6e696361, 0x72742f67, 0x5f656361, 0x65706970, 0x00000000, + 0x00000003, 0x000014a8, 0x00000002, 0x00000010, 0x00000017, 0x000001cc, 0x00000014, 0x00000011, 0x00000015, 0x00000000, 0x00000006, 0x00000128, + 0x0000000b, 0x00000010, 0x00000005, 0x00000158, 0x0000000a, 0x0000001c, 0x6ffffef5, 0x00000174, 0x00000004, 0x0000018c, 0x00000001, 0x0000000d, + 0x0000001e, 0x00000008, 0x6ffffffb, 0x00000001, 0x6ffffff0, 0x000001a4, 0x6ffffffe, 0x000001ac, 0x6fffffff, 0x00000001, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x000001dc, 0x000001dc, +}; + +static const unsigned int tracy_systrace_aarch64_size = 1650; +static const unsigned int tracy_systrace_aarch64_data[1652/4] = +{ + 0x464c457f, 0x00010102, 0x00000000, 0x00000000, 0x00b70003, 0x00000001, 0x00000300, 0x00000000, 0x00000040, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00380040, 0x00400006, 0x00000000, 0x00000006, 0x00000005, 0x00000040, 0x00000000, 0x00000040, 0x00000000, 0x00000040, 0x00000000, + 0x00000150, 0x00000000, 0x00000150, 0x00000000, 0x00000008, 0x00000000, 0x00000003, 0x00000004, 0x00000190, 0x00000000, 0x00000190, 0x00000000, + 0x00000190, 0x00000000, 0x00000015, 0x00000000, 0x00000015, 0x00000000, 0x00000001, 0x00000000, 0x00000001, 0x00000005, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x000004d1, 0x00000000, 0x000004d1, 0x00000000, 0x00010000, 0x00000000, 0x00000001, 0x00000006, + 0x000004d8, 0x00000000, 0x000104d8, 0x00000000, 0x000104d8, 0x00000000, 0x0000019a, 0x00000000, 0x000001a0, 0x00000000, 0x00010000, 0x00000000, + 0x00000002, 0x00000006, 0x000004d8, 0x00000000, 0x000104d8, 0x00000000, 0x000104d8, 0x00000000, 0x00000170, 0x00000000, 0x00000170, 0x00000000, + 0x00000008, 0x00000000, 0x6474e551, 0x00000006, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000010, 0x00000000, 0x7379732f, 0x2f6d6574, 0x2f6e6962, 0x6b6e696c, 0x34367265, 0x00000000, 0x00000001, 0x00000004, + 0x00000003, 0x00000000, 0x00000000, 0x00000000, 0x00000002, 0x00000000, 0x00000001, 0x00000001, 0x00000001, 0x00000000, 
0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x000a0003, 0x00000300, 0x00000000, + 0x00000000, 0x00000000, 0x0000000a, 0x00000012, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000011, 0x00000012, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x62696c00, 0x732e6c64, 0x6c64006f, 0x6e65706f, 0x736c6400, 0x4c006d79, 0x00434249, 0x00000000, 0x00020002, 0x00000000, + 0x00010001, 0x00000001, 0x00000010, 0x00000000, 0x00050d63, 0x00020000, 0x00000017, 0x00000000, 0x00010668, 0x00000000, 0x00000402, 0x00000002, + 0x00000000, 0x00000000, 0x00010670, 0x00000000, 0x00000402, 0x00000003, 0x00000000, 0x00000000, 0xa9bf7bf0, 0x90000090, 0xf9433211, 0x91198210, + 0xd61f0220, 0xd503201f, 0xd503201f, 0xd503201f, 0x90000090, 0xf9433611, 0x9119a210, 0xd61f0220, 0x90000090, 0xf9433a11, 0x9119c210, 0xd61f0220, + 0xf81b0ffc, 0xa9015ff8, 0xa90257f6, 0xa9034ff4, 0xa9047bfd, 0x910103fd, 0xd14043ff, 0xd10043ff, 0x90000000, 0x91120000, 0x320003e1, 0x97ffffed, + 0x90000001, 0x91122021, 0xaa0003f7, 0x97ffffed, 0x90000001, 0xaa0003f8, 0x91123421, 0xaa1703e0, 0x97ffffe8, 0x90000001, 0xaa0003f3, 0x91124821, + 0xaa1703e0, 0x97ffffe3, 0x90000001, 0xaa0003f4, 0x91125c21, 0xaa1703e0, 0x97ffffde, 0x90000001, 0xaa0003f5, 0x91128421, 0xaa1703e0, 0x97ffffd9, + 0x90000001, 0xaa0003f6, 0x91129821, 0xaa1703e0, 0x97ffffd4, 0xaa0003f7, 0x90000000, 0x9112b000, 0x2a1f03e1, 0xd63f0300, 0x2a0003f8, 0x36f80060, + 0x2a1f03e0, 0xd63f0260, 0x90000008, 0x3dc11d00, 0x52800128, 0xb81c83b8, 0x781cc3a8, 0x3d8003e0, 0x14000005, 0x93407c02, 0x320003e0, 0x910043e1, + 0xd63f02e0, 0xd100e3a0, 0x320003e1, 0x2a1f03e2, 0xd63f0280, 0x7100001f, 0x5400014c, 0x910003e0, 0xaa1f03e1, 0xd63f02a0, 0xd100e3a0, 0x320003e1, + 0x2a1f03e2, 0xd63f0280, 0x7100041f, 0x54ffff0b, 0x910043e1, 0x321003e2, 0x2a1803e0, 0xd63f02c0, 0x7100041f, 0x54fffd0a, 0x2a1f03e0, 0xd63f0260, + 0x914043ff, 0x910043ff, 0xa9447bfd, 0xa9434ff4, 0xa94257f6, 0xa9415ff8, 0xf84507fc, 0xd65f03c0, 0x00000000, 0x00000000, 0x00989680, 0x00000000, + 0x6362696c, 0x006f732e, 0x6e65706f, 0x69786500, 0x6f700074, 0x6e006c6c, 0x736f6e61, 0x7065656c, 0x61657200, 0x72770064, 0x00657469, 0x7379732f, + 0x72656b2f, 0x2f6c656e, 0x75626564, 0x72742f67, 0x6e696361, 0x72742f67, 0x5f656361, 0x65706970, 0x00000000, 0x00000000, 0x00000001, 0x00000000, + 0x00000001, 0x00000000, 0x00000004, 0x00000000, 0x000001a8, 0x00000000, 0x6ffffef5, 0x00000000, 0x000001c8, 0x00000000, 0x00000005, 0x00000000, + 0x00000248, 0x00000000, 0x00000006, 0x00000000, 0x000001e8, 0x00000000, 0x0000000a, 0x00000000, 0x0000001c, 0x00000000, 0x0000000b, 0x00000000, + 0x00000018, 0x00000000, 0x00000015, 0x00000000, 0x00000000, 0x00000000, 0x00000003, 0x00000000, 0x00010650, 0x00000000, 0x00000002, 0x00000000, + 0x00000030, 0x00000000, 0x00000014, 0x00000000, 0x00000007, 0x00000000, 0x00000017, 0x00000000, 0x00000290, 0x00000000, 0x0000001e, 0x00000000, + 0x00000008, 0x00000000, 0x6ffffffb, 0x00000000, 0x00000001, 0x00000000, 0x6ffffffe, 0x00000000, 0x00000270, 0x00000000, 0x6fffffff, 0x00000000, + 0x00000001, 0x00000000, 0x6ffffff0, 0x00000000, 0x00000264, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x000104d8, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 
0x000002c0, 0x00000000, 0x000002c0, +}; + +} diff --git a/libs/tracy/client/TracyThread.hpp b/libs/tracy/client/TracyThread.hpp @@ -0,0 +1,70 @@ +#ifndef __TRACYTHREAD_HPP__ +#define __TRACYTHREAD_HPP__ + +#if defined _WIN32 || defined __CYGWIN__ +# include <windows.h> +#else +# include <pthread.h> +#endif + +namespace tracy +{ + +#if defined _WIN32 || defined __CYGWIN__ + +class Thread +{ +public: + Thread( void(*func)( void* ptr ), void* ptr ) + : m_func( func ) + , m_ptr( ptr ) + , m_hnd( CreateThread( nullptr, 0, Launch, this, 0, nullptr ) ) + {} + + ~Thread() + { + WaitForSingleObject( m_hnd, INFINITE ); + CloseHandle( m_hnd ); + } + + HANDLE Handle() const { return m_hnd; } + +private: + static DWORD WINAPI Launch( void* ptr ) { ((Thread*)ptr)->m_func( ((Thread*)ptr)->m_ptr ); return 0; } + + void(*m_func)( void* ptr ); + void* m_ptr; + HANDLE m_hnd; +}; + +#else + +class Thread +{ +public: + Thread( void(*func)( void* ptr ), void* ptr ) + : m_func( func ) + , m_ptr( ptr ) + { + pthread_create( &m_thread, nullptr, Launch, this ); + } + + ~Thread() + { + pthread_join( m_thread, nullptr ); + } + + pthread_t Handle() const { return m_thread; } + +private: + static void* Launch( void* ptr ) { ((Thread*)ptr)->m_func( ((Thread*)ptr)->m_ptr ); return nullptr; } + void(*m_func)( void* ptr ); + void* m_ptr; + pthread_t m_thread; +}; + +#endif + +} + +#endif diff --git a/libs/tracy/client/tracy_concurrentqueue.h b/libs/tracy/client/tracy_concurrentqueue.h @@ -0,0 +1,1552 @@ +// Provides a C++11 implementation of a multi-producer, multi-consumer lock-free queue. +// An overview, including benchmark results, is provided here: +// http://moodycamel.com/blog/2014/a-fast-general-purpose-lock-free-queue-for-c++ +// The full design is also described in excruciating detail at: +// http://moodycamel.com/blog/2014/detailed-design-of-a-lock-free-queue + +// Simplified BSD license: +// Copyright (c) 2013-2016, Cameron Desrochers. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// - Redistributions of source code must retain the above copyright notice, this list of +// conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, this list of +// conditions and the following disclaimer in the documentation and/or other materials +// provided with the distribution. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL +// THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT +// OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +// HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR +// TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +// EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ + +#pragma once + +#include "../common/TracyAlloc.hpp" +#include "../common/TracyForceInline.hpp" +#include "../common/TracySystem.hpp" + +#if defined(__GNUC__) +// Disable -Wconversion warnings (spuriously triggered when Traits::size_t and +// Traits::index_t are set to < 32 bits, causing integer promotion, causing warnings +// upon assigning any computed values) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wconversion" +#endif + +#if defined(__APPLE__) +#include "TargetConditionals.h" +#endif + +#include <atomic> // Requires C++11. Sorry VS2010. +#include <cassert> +#include <cstddef> // for max_align_t +#include <cstdint> +#include <cstdlib> +#include <type_traits> +#include <algorithm> +#include <utility> +#include <limits> +#include <climits> // for CHAR_BIT +#include <array> +#include <thread> // partly for __WINPTHREADS_VERSION if on MinGW-w64 w/ POSIX threading + +namespace tracy +{ + +// Exceptions +#ifndef MOODYCAMEL_EXCEPTIONS_ENABLED +#if (defined(_MSC_VER) && defined(_CPPUNWIND)) || (defined(__GNUC__) && defined(__EXCEPTIONS)) || (!defined(_MSC_VER) && !defined(__GNUC__)) +#define MOODYCAMEL_EXCEPTIONS_ENABLED +#endif +#endif +#ifdef MOODYCAMEL_EXCEPTIONS_ENABLED +#define MOODYCAMEL_TRY try +#define MOODYCAMEL_CATCH(...) catch(__VA_ARGS__) +#define MOODYCAMEL_RETHROW throw +#define MOODYCAMEL_THROW(expr) throw (expr) +#else +#define MOODYCAMEL_TRY if (true) +#define MOODYCAMEL_CATCH(...) else if (false) +#define MOODYCAMEL_RETHROW +#define MOODYCAMEL_THROW(expr) +#endif + +#ifndef MOODYCAMEL_NOEXCEPT +#if !defined(MOODYCAMEL_EXCEPTIONS_ENABLED) +#define MOODYCAMEL_NOEXCEPT +#define MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr) true +#define MOODYCAMEL_NOEXCEPT_ASSIGN(type, valueType, expr) true +#elif defined(_MSC_VER) && defined(_NOEXCEPT) && _MSC_VER < 1800 +// VS2012's std::is_nothrow_[move_]constructible is broken and returns true when it shouldn't :-( +// We have to assume *all* non-trivial constructors may throw on VS2012! +#define MOODYCAMEL_NOEXCEPT _NOEXCEPT +#define MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr) (std::is_rvalue_reference<valueType>::value && std::is_move_constructible<type>::value ? std::is_trivially_move_constructible<type>::value : std::is_trivially_copy_constructible<type>::value) +#define MOODYCAMEL_NOEXCEPT_ASSIGN(type, valueType, expr) ((std::is_rvalue_reference<valueType>::value && std::is_move_assignable<type>::value ? std::is_trivially_move_assignable<type>::value || std::is_nothrow_move_assignable<type>::value : std::is_trivially_copy_assignable<type>::value || std::is_nothrow_copy_assignable<type>::value) && MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr)) +#elif defined(_MSC_VER) && defined(_NOEXCEPT) && _MSC_VER < 1900 +#define MOODYCAMEL_NOEXCEPT _NOEXCEPT +#define MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr) (std::is_rvalue_reference<valueType>::value && std::is_move_constructible<type>::value ? std::is_trivially_move_constructible<type>::value || std::is_nothrow_move_constructible<type>::value : std::is_trivially_copy_constructible<type>::value || std::is_nothrow_copy_constructible<type>::value) +#define MOODYCAMEL_NOEXCEPT_ASSIGN(type, valueType, expr) ((std::is_rvalue_reference<valueType>::value && std::is_move_assignable<type>::value ? 
std::is_trivially_move_assignable<type>::value || std::is_nothrow_move_assignable<type>::value : std::is_trivially_copy_assignable<type>::value || std::is_nothrow_copy_assignable<type>::value) && MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr)) +#else +#define MOODYCAMEL_NOEXCEPT noexcept +#define MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr) noexcept(expr) +#define MOODYCAMEL_NOEXCEPT_ASSIGN(type, valueType, expr) noexcept(expr) +#endif +#endif + +// VS2012 doesn't support deleted functions. +// In this case, we declare the function normally but don't define it. A link error will be generated if the function is called. +#ifndef MOODYCAMEL_DELETE_FUNCTION +#if defined(_MSC_VER) && _MSC_VER < 1800 +#define MOODYCAMEL_DELETE_FUNCTION +#else +#define MOODYCAMEL_DELETE_FUNCTION = delete +#endif +#endif + +// Compiler-specific likely/unlikely hints +namespace moodycamel { namespace details { +#if defined(__GNUC__) + inline bool cqLikely(bool x) { return __builtin_expect((x), true); } + inline bool cqUnlikely(bool x) { return __builtin_expect((x), false); } +#else + inline bool cqLikely(bool x) { return x; } + inline bool cqUnlikely(bool x) { return x; } +#endif +} } + +namespace +{ + // to avoid MSVC warning 4127: conditional expression is constant + template <bool> + struct compile_time_condition + { + static const bool value = false; + }; + template <> + struct compile_time_condition<true> + { + static const bool value = true; + }; +} + +namespace moodycamel { +namespace details { + template<typename T> + struct const_numeric_max { + static_assert(std::is_integral<T>::value, "const_numeric_max can only be used with integers"); + static const T value = std::numeric_limits<T>::is_signed + ? (static_cast<T>(1) << (sizeof(T) * CHAR_BIT - 1)) - static_cast<T>(1) + : static_cast<T>(-1); + }; + +#if defined(__GLIBCXX__) + typedef ::max_align_t std_max_align_t; // libstdc++ forgot to add it to std:: for a while +#else + typedef std::max_align_t std_max_align_t; // Others (e.g. MSVC) insist it can *only* be accessed via std:: +#endif + + // Some platforms have incorrectly set max_align_t to a type with <8 bytes alignment even while supporting + // 8-byte aligned scalar values (*cough* 32-bit iOS). Work around this with our own union. See issue #64. + typedef union { + std_max_align_t x; + long long y; + void* z; + } max_align_t; +} + +// Default traits for the ConcurrentQueue. To change some of the +// traits without re-implementing all of them, inherit from this +// struct and shadow the declarations you wish to be different; +// since the traits are used as a template type parameter, the +// shadowed declarations will be used where defined, and the defaults +// otherwise. +struct ConcurrentQueueDefaultTraits +{ + // General-purpose size type. std::size_t is strongly recommended. + typedef std::size_t size_t; + + // The type used for the enqueue and dequeue indices. Must be at least as + // large as size_t. Should be significantly larger than the number of elements + // you expect to hold at once, especially if you have a high turnover rate; + // for example, on 32-bit x86, if you expect to have over a hundred million + // elements or pump several million elements through your queue in a very + // short space of time, using a 32-bit type *may* trigger a race condition. + // A 64-bit int type is recommended in that case, and in practice will + // prevent a race condition no matter the usage of the queue. 
Note that + // whether the queue is lock-free with a 64-int type depends on the whether + // std::atomic<std::uint64_t> is lock-free, which is platform-specific. + typedef std::size_t index_t; + + // Internally, all elements are enqueued and dequeued from multi-element + // blocks; this is the smallest controllable unit. If you expect few elements + // but many producers, a smaller block size should be favoured. For few producers + // and/or many elements, a larger block size is preferred. A sane default + // is provided. Must be a power of 2. + static const size_t BLOCK_SIZE = 128; + + // For explicit producers (i.e. when using a producer token), the block is + // checked for being empty by iterating through a list of flags, one per element. + // For large block sizes, this is too inefficient, and switching to an atomic + // counter-based approach is faster. The switch is made for block sizes strictly + // larger than this threshold. + static const size_t EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD = 32; + + // How many full blocks can be expected for a single explicit producer? This should + // reflect that number's maximum for optimal performance. Must be a power of 2. + static const size_t EXPLICIT_INITIAL_INDEX_SIZE = 32; + + // Controls the number of items that an explicit consumer (i.e. one with a token) + // must consume before it causes all consumers to rotate and move on to the next + // internal queue. + static const std::uint32_t EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE = 256; + + // The maximum number of elements (inclusive) that can be enqueued to a sub-queue. + // Enqueue operations that would cause this limit to be surpassed will fail. Note + // that this limit is enforced at the block level (for performance reasons), i.e. + // it's rounded up to the nearest block size. + static const size_t MAX_SUBQUEUE_SIZE = details::const_numeric_max<size_t>::value; + + + // Memory allocation can be customized if needed. + // malloc should return nullptr on failure, and handle alignment like std::malloc. +#if defined(malloc) || defined(free) + // Gah, this is 2015, stop defining macros that break standard code already! + // Work around malloc/free being special macros: + static inline void* WORKAROUND_malloc(size_t size) { return malloc(size); } + static inline void WORKAROUND_free(void* ptr) { return free(ptr); } + static inline void* (malloc)(size_t size) { return WORKAROUND_malloc(size); } + static inline void (free)(void* ptr) { return WORKAROUND_free(ptr); } +#else + static inline void* malloc(size_t size) { return tracy::tracy_malloc(size); } + static inline void free(void* ptr) { return tracy::tracy_free(ptr); } +#endif +}; + + +// When producing or consuming many elements, the most efficient way is to: +// 1) Use one of the bulk-operation methods of the queue with a token +// 2) Failing that, use the bulk-operation methods without a token +// 3) Failing that, create a token and use that with the single-item methods +// 4) Failing that, use the single-parameter methods of the queue +// Having said that, don't create tokens willy-nilly -- ideally there should be +// a maximum of one token per thread (of each kind). 
+struct ProducerToken; +struct ConsumerToken; + +template<typename T, typename Traits> class ConcurrentQueue; + + +namespace details +{ + struct ConcurrentQueueProducerTypelessBase + { + ConcurrentQueueProducerTypelessBase* next; + std::atomic<bool> inactive; + ProducerToken* token; + uint64_t threadId; + + ConcurrentQueueProducerTypelessBase() + : next(nullptr), inactive(false), token(nullptr), threadId(0) + { + } + }; + + template<typename T> + static inline bool circular_less_than(T a, T b) + { +#ifdef _MSC_VER +#pragma warning(push) +#pragma warning(disable: 4554) +#endif + static_assert(std::is_integral<T>::value && !std::numeric_limits<T>::is_signed, "circular_less_than is intended to be used only with unsigned integer types"); + return static_cast<T>(a - b) > static_cast<T>(static_cast<T>(1) << static_cast<T>(sizeof(T) * CHAR_BIT - 1)); +#ifdef _MSC_VER +#pragma warning(pop) +#endif + } + + template<typename U> + static inline char* align_for(char* ptr) + { + const std::size_t alignment = std::alignment_of<U>::value; + return ptr + (alignment - (reinterpret_cast<std::uintptr_t>(ptr) % alignment)) % alignment; + } + + template<typename T> + static inline T ceil_to_pow_2(T x) + { + static_assert(std::is_integral<T>::value && !std::numeric_limits<T>::is_signed, "ceil_to_pow_2 is intended to be used only with unsigned integer types"); + + // Adapted from http://graphics.stanford.edu/~seander/bithacks.html#RoundUpPowerOf2 + --x; + x |= x >> 1; + x |= x >> 2; + x |= x >> 4; + for (std::size_t i = 1; i < sizeof(T); i <<= 1) { + x |= x >> (i << 3); + } + ++x; + return x; + } + + template<typename T> + static inline void swap_relaxed(std::atomic<T>& left, std::atomic<T>& right) + { + T temp = std::move(left.load(std::memory_order_relaxed)); + left.store(std::move(right.load(std::memory_order_relaxed)), std::memory_order_relaxed); + right.store(std::move(temp), std::memory_order_relaxed); + } + + template<typename T> + static inline T const& nomove(T const& x) + { + return x; + } + + template<bool Enable> + struct nomove_if + { + template<typename T> + static inline T const& eval(T const& x) + { + return x; + } + }; + + template<> + struct nomove_if<false> + { + template<typename U> + static inline auto eval(U&& x) + -> decltype(std::forward<U>(x)) + { + return std::forward<U>(x); + } + }; + + template<typename It> + static inline auto deref_noexcept(It& it) MOODYCAMEL_NOEXCEPT -> decltype(*it) + { + return *it; + } + +#if defined(__clang__) || !defined(__GNUC__) || __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8) + template<typename T> struct is_trivially_destructible : std::is_trivially_destructible<T> { }; +#else + template<typename T> struct is_trivially_destructible : std::has_trivial_destructor<T> { }; +#endif + + template<typename T> struct static_is_lock_free_num { enum { value = 0 }; }; + template<> struct static_is_lock_free_num<signed char> { enum { value = ATOMIC_CHAR_LOCK_FREE }; }; + template<> struct static_is_lock_free_num<short> { enum { value = ATOMIC_SHORT_LOCK_FREE }; }; + template<> struct static_is_lock_free_num<int> { enum { value = ATOMIC_INT_LOCK_FREE }; }; + template<> struct static_is_lock_free_num<long> { enum { value = ATOMIC_LONG_LOCK_FREE }; }; + template<> struct static_is_lock_free_num<long long> { enum { value = ATOMIC_LLONG_LOCK_FREE }; }; + template<typename T> struct static_is_lock_free : static_is_lock_free_num<typename std::make_signed<T>::type> { }; + template<> struct static_is_lock_free<bool> { enum { value = ATOMIC_BOOL_LOCK_FREE }; }; + 
template<typename U> struct static_is_lock_free<U*> { enum { value = ATOMIC_POINTER_LOCK_FREE }; }; +} + + +struct ProducerToken +{ + template<typename T, typename Traits> + explicit ProducerToken(ConcurrentQueue<T, Traits>& queue); + + ProducerToken(ProducerToken&& other) MOODYCAMEL_NOEXCEPT + : producer(other.producer) + { + other.producer = nullptr; + if (producer != nullptr) { + producer->token = this; + } + } + + inline ProducerToken& operator=(ProducerToken&& other) MOODYCAMEL_NOEXCEPT + { + swap(other); + return *this; + } + + void swap(ProducerToken& other) MOODYCAMEL_NOEXCEPT + { + std::swap(producer, other.producer); + if (producer != nullptr) { + producer->token = this; + } + if (other.producer != nullptr) { + other.producer->token = &other; + } + } + + // A token is always valid unless: + // 1) Memory allocation failed during construction + // 2) It was moved via the move constructor + // (Note: assignment does a swap, leaving both potentially valid) + // 3) The associated queue was destroyed + // Note that if valid() returns true, that only indicates + // that the token is valid for use with a specific queue, + // but not which one; that's up to the user to track. + inline bool valid() const { return producer != nullptr; } + + ~ProducerToken() + { + if (producer != nullptr) { + producer->token = nullptr; + producer->inactive.store(true, std::memory_order_release); + } + } + + // Disable copying and assignment + ProducerToken(ProducerToken const&) MOODYCAMEL_DELETE_FUNCTION; + ProducerToken& operator=(ProducerToken const&) MOODYCAMEL_DELETE_FUNCTION; + +private: + template<typename T, typename Traits> friend class ConcurrentQueue; + +protected: + details::ConcurrentQueueProducerTypelessBase* producer; +}; + + +struct ConsumerToken +{ + template<typename T, typename Traits> + explicit ConsumerToken(ConcurrentQueue<T, Traits>& q); + + ConsumerToken(ConsumerToken&& other) MOODYCAMEL_NOEXCEPT + : initialOffset(other.initialOffset), lastKnownGlobalOffset(other.lastKnownGlobalOffset), itemsConsumedFromCurrent(other.itemsConsumedFromCurrent), currentProducer(other.currentProducer), desiredProducer(other.desiredProducer) + { + } + + inline ConsumerToken& operator=(ConsumerToken&& other) MOODYCAMEL_NOEXCEPT + { + swap(other); + return *this; + } + + void swap(ConsumerToken& other) MOODYCAMEL_NOEXCEPT + { + std::swap(initialOffset, other.initialOffset); + std::swap(lastKnownGlobalOffset, other.lastKnownGlobalOffset); + std::swap(itemsConsumedFromCurrent, other.itemsConsumedFromCurrent); + std::swap(currentProducer, other.currentProducer); + std::swap(desiredProducer, other.desiredProducer); + } + + // Disable copying and assignment + ConsumerToken(ConsumerToken const&) MOODYCAMEL_DELETE_FUNCTION; + ConsumerToken& operator=(ConsumerToken const&) MOODYCAMEL_DELETE_FUNCTION; + +private: + template<typename T, typename Traits> friend class ConcurrentQueue; + +private: // but shared with ConcurrentQueue + std::uint32_t initialOffset; + std::uint32_t lastKnownGlobalOffset; + std::uint32_t itemsConsumedFromCurrent; + details::ConcurrentQueueProducerTypelessBase* currentProducer; + details::ConcurrentQueueProducerTypelessBase* desiredProducer; +}; + + +template<typename T, typename Traits = ConcurrentQueueDefaultTraits> +class ConcurrentQueue +{ +public: + struct ExplicitProducer; + + typedef moodycamel::ProducerToken producer_token_t; + typedef moodycamel::ConsumerToken consumer_token_t; + + typedef typename Traits::index_t index_t; + typedef typename Traits::size_t size_t; + + static const 
size_t BLOCK_SIZE = static_cast<size_t>(Traits::BLOCK_SIZE); + static const size_t EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD = static_cast<size_t>(Traits::EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD); + static const size_t EXPLICIT_INITIAL_INDEX_SIZE = static_cast<size_t>(Traits::EXPLICIT_INITIAL_INDEX_SIZE); + static const std::uint32_t EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE = static_cast<std::uint32_t>(Traits::EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE); +#ifdef _MSC_VER +#pragma warning(push) +#pragma warning(disable: 4307) // + integral constant overflow (that's what the ternary expression is for!) +#pragma warning(disable: 4309) // static_cast: Truncation of constant value +#endif + static const size_t MAX_SUBQUEUE_SIZE = (details::const_numeric_max<size_t>::value - static_cast<size_t>(Traits::MAX_SUBQUEUE_SIZE) < BLOCK_SIZE) ? details::const_numeric_max<size_t>::value : ((static_cast<size_t>(Traits::MAX_SUBQUEUE_SIZE) + (BLOCK_SIZE - 1)) / BLOCK_SIZE * BLOCK_SIZE); +#ifdef _MSC_VER +#pragma warning(pop) +#endif + + static_assert(!std::numeric_limits<size_t>::is_signed && std::is_integral<size_t>::value, "Traits::size_t must be an unsigned integral type"); + static_assert(!std::numeric_limits<index_t>::is_signed && std::is_integral<index_t>::value, "Traits::index_t must be an unsigned integral type"); + static_assert(sizeof(index_t) >= sizeof(size_t), "Traits::index_t must be at least as wide as Traits::size_t"); + static_assert((BLOCK_SIZE > 1) && !(BLOCK_SIZE & (BLOCK_SIZE - 1)), "Traits::BLOCK_SIZE must be a power of 2 (and at least 2)"); + static_assert((EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD > 1) && !(EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD & (EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD - 1)), "Traits::EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD must be a power of 2 (and greater than 1)"); + static_assert((EXPLICIT_INITIAL_INDEX_SIZE > 1) && !(EXPLICIT_INITIAL_INDEX_SIZE & (EXPLICIT_INITIAL_INDEX_SIZE - 1)), "Traits::EXPLICIT_INITIAL_INDEX_SIZE must be a power of 2 (and greater than 1)"); + +public: + // Creates a queue with at least `capacity` element slots; note that the + // actual number of elements that can be inserted without additional memory + // allocation depends on the number of producers and the block size (e.g. if + // the block size is equal to `capacity`, only a single block will be allocated + // up-front, which means only a single producer will be able to enqueue elements + // without an extra allocation -- blocks aren't shared between producers). + // This method is not thread safe -- it is up to the user to ensure that the + // queue is fully constructed before it starts being used by other threads (this + // includes making the memory effects of construction visible, possibly with a + // memory barrier). + explicit ConcurrentQueue(size_t capacity = 6 * BLOCK_SIZE) + : producerListTail(nullptr), + producerCount(0), + initialBlockPoolIndex(0), + nextExplicitConsumerId(0), + globalExplicitConsumerOffset(0) + { + populate_initial_block_list(capacity / BLOCK_SIZE + ((capacity & (BLOCK_SIZE - 1)) == 0 ? 0 : 1)); + } + + // Computes the correct amount of pre-allocated blocks for you based + // on the minimum number of elements you want available at any given + // time, and the maximum concurrent number of each type of producer. 
+ ConcurrentQueue(size_t minCapacity, size_t maxExplicitProducers) + : producerListTail(nullptr), + producerCount(0), + initialBlockPoolIndex(0), + nextExplicitConsumerId(0), + globalExplicitConsumerOffset(0) + { + size_t blocks = (((minCapacity + BLOCK_SIZE - 1) / BLOCK_SIZE) - 1) * (maxExplicitProducers + 1) + 2 * (maxExplicitProducers); + populate_initial_block_list(blocks); + } + + // Note: The queue should not be accessed concurrently while it's + // being deleted. It's up to the user to synchronize this. + // This method is not thread safe. + ~ConcurrentQueue() + { + // Destroy producers + auto ptr = producerListTail.load(std::memory_order_relaxed); + while (ptr != nullptr) { + auto next = ptr->next_prod(); + if (ptr->token != nullptr) { + ptr->token->producer = nullptr; + } + destroy(ptr); + ptr = next; + } + + // Destroy global free list + auto block = freeList.head_unsafe(); + while (block != nullptr) { + auto next = block->freeListNext.load(std::memory_order_relaxed); + if (block->dynamicallyAllocated) { + destroy(block); + } + block = next; + } + + // Destroy initial free list + destroy_array(initialBlockPool, initialBlockPoolSize); + } + + // Disable copying and copy assignment + ConcurrentQueue(ConcurrentQueue const&) MOODYCAMEL_DELETE_FUNCTION; + ConcurrentQueue(ConcurrentQueue&& other) MOODYCAMEL_DELETE_FUNCTION; + ConcurrentQueue& operator=(ConcurrentQueue const&) MOODYCAMEL_DELETE_FUNCTION; + ConcurrentQueue& operator=(ConcurrentQueue&& other) MOODYCAMEL_DELETE_FUNCTION; + +public: + tracy_force_inline T* enqueue_begin(producer_token_t const& token, index_t& currentTailIndex) + { + return static_cast<ExplicitProducer*>(token.producer)->ConcurrentQueue::ExplicitProducer::enqueue_begin(currentTailIndex); + } + + // Attempts to dequeue several elements from the queue using an explicit consumer token. + // Returns the number of items actually dequeued. + // Returns 0 if all producer streams appeared empty at the time they + // were checked (so, the queue is likely but not guaranteed to be empty). + // Never allocates. Thread-safe. 
+ template<typename It> + size_t try_dequeue_bulk(consumer_token_t& token, It itemFirst, size_t max) + { + if (token.desiredProducer == nullptr || token.lastKnownGlobalOffset != globalExplicitConsumerOffset.load(std::memory_order_relaxed)) { + if (!update_current_producer_after_rotation(token)) { + return 0; + } + } + + size_t count = static_cast<ProducerBase*>(token.currentProducer)->dequeue_bulk(itemFirst, max); + if (count == max) { + if ((token.itemsConsumedFromCurrent += static_cast<std::uint32_t>(max)) >= EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE) { + globalExplicitConsumerOffset.fetch_add(1, std::memory_order_relaxed); + } + return max; + } + token.itemsConsumedFromCurrent += static_cast<std::uint32_t>(count); + max -= count; + + auto tail = producerListTail.load(std::memory_order_acquire); + auto ptr = static_cast<ProducerBase*>(token.currentProducer)->next_prod(); + if (ptr == nullptr) { + ptr = tail; + } + while (ptr != static_cast<ProducerBase*>(token.currentProducer)) { + auto dequeued = ptr->dequeue_bulk(itemFirst, max); + count += dequeued; + if (dequeued != 0) { + token.currentProducer = ptr; + token.itemsConsumedFromCurrent = static_cast<std::uint32_t>(dequeued); + } + if (dequeued == max) { + break; + } + max -= dequeued; + ptr = ptr->next_prod(); + if (ptr == nullptr) { + ptr = tail; + } + } + return count; + } + + template<typename It> + size_t try_dequeue_bulk_single(consumer_token_t& token, It itemFirst, size_t max, uint64_t& threadId ) + { + if (token.desiredProducer == nullptr || token.lastKnownGlobalOffset != globalExplicitConsumerOffset.load(std::memory_order_relaxed)) { + if (!update_current_producer_after_rotation(token)) { + return 0; + } + } + + size_t count = static_cast<ProducerBase*>(token.currentProducer)->dequeue_bulk(itemFirst, max); + if (count == max) { + if ((token.itemsConsumedFromCurrent += static_cast<std::uint32_t>(max)) >= EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE) { + globalExplicitConsumerOffset.fetch_add(1, std::memory_order_relaxed); + } + threadId = token.currentProducer->threadId; + return max; + } + token.itemsConsumedFromCurrent += static_cast<std::uint32_t>(count); + + auto tail = producerListTail.load(std::memory_order_acquire); + auto ptr = static_cast<ProducerBase*>(token.currentProducer)->next_prod(); + if (ptr == nullptr) { + ptr = tail; + } + if( count == 0 ) + { + while (ptr != static_cast<ProducerBase*>(token.currentProducer)) { + auto dequeued = ptr->dequeue_bulk(itemFirst, max); + if (dequeued != 0) { + threadId = ptr->threadId; + token.currentProducer = ptr; + token.itemsConsumedFromCurrent = static_cast<std::uint32_t>(dequeued); + return dequeued; + } + ptr = ptr->next_prod(); + if (ptr == nullptr) { + ptr = tail; + } + } + return 0; + } + else + { + threadId = token.currentProducer->threadId; + token.currentProducer = ptr; + token.itemsConsumedFromCurrent = 0; + return count; + } + } + + + // Returns an estimate of the total number of elements currently in the queue. This + // estimate is only accurate if the queue has completely stabilized before it is called + // (i.e. all enqueue and dequeue operations have completed and their memory effects are + // visible on the calling thread, and no further operations start while this method is + // being called). + // Thread-safe. 
+ size_t size_approx() const + { + size_t size = 0; + for (auto ptr = producerListTail.load(std::memory_order_acquire); ptr != nullptr; ptr = ptr->next_prod()) { + size += ptr->size_approx(); + } + return size; + } + + + // Returns true if the underlying atomic variables used by + // the queue are lock-free (they should be on most platforms). + // Thread-safe. + static bool is_lock_free() + { + return + details::static_is_lock_free<bool>::value == 2 && + details::static_is_lock_free<size_t>::value == 2 && + details::static_is_lock_free<std::uint32_t>::value == 2 && + details::static_is_lock_free<index_t>::value == 2 && + details::static_is_lock_free<void*>::value == 2; + } + + +private: + friend struct ProducerToken; + friend struct ConsumerToken; + friend struct ExplicitProducer; + + + /////////////////////////////// + // Queue methods + /////////////////////////////// + + inline bool update_current_producer_after_rotation(consumer_token_t& token) + { + // Ah, there's been a rotation, figure out where we should be! + auto tail = producerListTail.load(std::memory_order_acquire); + if (token.desiredProducer == nullptr && tail == nullptr) { + return false; + } + auto prodCount = producerCount.load(std::memory_order_relaxed); + auto globalOffset = globalExplicitConsumerOffset.load(std::memory_order_relaxed); + if (details::cqUnlikely(token.desiredProducer == nullptr)) { + // Aha, first time we're dequeueing anything. + // Figure out our local position + // Note: offset is from start, not end, but we're traversing from end -- subtract from count first + std::uint32_t offset = prodCount - 1 - (token.initialOffset % prodCount); + token.desiredProducer = tail; + for (std::uint32_t i = 0; i != offset; ++i) { + token.desiredProducer = static_cast<ProducerBase*>(token.desiredProducer)->next_prod(); + if (token.desiredProducer == nullptr) { + token.desiredProducer = tail; + } + } + } + + std::uint32_t delta = globalOffset - token.lastKnownGlobalOffset; + if (delta >= prodCount) { + delta = delta % prodCount; + } + for (std::uint32_t i = 0; i != delta; ++i) { + token.desiredProducer = static_cast<ProducerBase*>(token.desiredProducer)->next_prod(); + if (token.desiredProducer == nullptr) { + token.desiredProducer = tail; + } + } + + token.lastKnownGlobalOffset = globalOffset; + token.currentProducer = token.desiredProducer; + token.itemsConsumedFromCurrent = 0; + return true; + } + + + /////////////////////////// + // Free list + /////////////////////////// + + template <typename N> + struct FreeListNode + { + FreeListNode() : freeListRefs(0), freeListNext(nullptr) { } + + std::atomic<std::uint32_t> freeListRefs; + std::atomic<N*> freeListNext; + }; + + // A simple CAS-based lock-free free list. Not the fastest thing in the world under heavy contention, but + // simple and correct (assuming nodes are never freed until after the free list is destroyed), and fairly + // speedy under low contention. 
+ template<typename N> // N must inherit FreeListNode or have the same fields (and initialization of them) + struct FreeList + { + FreeList() : freeListHead(nullptr) { } + FreeList(FreeList&& other) : freeListHead(other.freeListHead.load(std::memory_order_relaxed)) { other.freeListHead.store(nullptr, std::memory_order_relaxed); } + void swap(FreeList& other) { details::swap_relaxed(freeListHead, other.freeListHead); } + + FreeList(FreeList const&) MOODYCAMEL_DELETE_FUNCTION; + FreeList& operator=(FreeList const&) MOODYCAMEL_DELETE_FUNCTION; + + inline void add(N* node) + { + // We know that the should-be-on-freelist bit is 0 at this point, so it's safe to + // set it using a fetch_add + if (node->freeListRefs.fetch_add(SHOULD_BE_ON_FREELIST, std::memory_order_acq_rel) == 0) { + // Oh look! We were the last ones referencing this node, and we know + // we want to add it to the free list, so let's do it! + add_knowing_refcount_is_zero(node); + } + } + + inline N* try_get() + { + auto head = freeListHead.load(std::memory_order_acquire); + while (head != nullptr) { + auto prevHead = head; + auto refs = head->freeListRefs.load(std::memory_order_relaxed); + if ((refs & REFS_MASK) == 0 || !head->freeListRefs.compare_exchange_strong(refs, refs + 1, std::memory_order_acquire, std::memory_order_relaxed)) { + head = freeListHead.load(std::memory_order_acquire); + continue; + } + + // Good, reference count has been incremented (it wasn't at zero), which means we can read the + // next and not worry about it changing between now and the time we do the CAS + auto next = head->freeListNext.load(std::memory_order_relaxed); + if (freeListHead.compare_exchange_strong(head, next, std::memory_order_acquire, std::memory_order_relaxed)) { + // Yay, got the node. This means it was on the list, which means shouldBeOnFreeList must be false no + // matter the refcount (because nobody else knows it's been taken off yet, it can't have been put back on). + assert((head->freeListRefs.load(std::memory_order_relaxed) & SHOULD_BE_ON_FREELIST) == 0); + + // Decrease refcount twice, once for our ref, and once for the list's ref + head->freeListRefs.fetch_sub(2, std::memory_order_release); + return head; + } + + // OK, the head must have changed on us, but we still need to decrease the refcount we increased. + // Note that we don't need to release any memory effects, but we do need to ensure that the reference + // count decrement happens-after the CAS on the head. + refs = prevHead->freeListRefs.fetch_sub(1, std::memory_order_acq_rel); + if (refs == SHOULD_BE_ON_FREELIST + 1) { + add_knowing_refcount_is_zero(prevHead); + } + } + + return nullptr; + } + + // Useful for traversing the list when there's no contention (e.g. to destroy remaining nodes) + N* head_unsafe() const { return freeListHead.load(std::memory_order_relaxed); } + + private: + inline void add_knowing_refcount_is_zero(N* node) + { + // Since the refcount is zero, and nobody can increase it once it's zero (except us, and we run + // only one copy of this method per node at a time, i.e. 
the single thread case), then we know + // we can safely change the next pointer of the node; however, once the refcount is back above + // zero, then other threads could increase it (happens under heavy contention, when the refcount + // goes to zero in between a load and a refcount increment of a node in try_get, then back up to + // something non-zero, then the refcount increment is done by the other thread) -- so, if the CAS + // to add the node to the actual list fails, decrease the refcount and leave the add operation to + // the next thread who puts the refcount back at zero (which could be us, hence the loop). + auto head = freeListHead.load(std::memory_order_relaxed); + while (true) { + node->freeListNext.store(head, std::memory_order_relaxed); + node->freeListRefs.store(1, std::memory_order_release); + if (!freeListHead.compare_exchange_strong(head, node, std::memory_order_release, std::memory_order_relaxed)) { + // Hmm, the add failed, but we can only try again when the refcount goes back to zero + if (node->freeListRefs.fetch_add(SHOULD_BE_ON_FREELIST - 1, std::memory_order_release) == 1) { + continue; + } + } + return; + } + } + + private: + // Implemented like a stack, but where node order doesn't matter (nodes are inserted out of order under contention) + std::atomic<N*> freeListHead; + + static const std::uint32_t REFS_MASK = 0x7FFFFFFF; + static const std::uint32_t SHOULD_BE_ON_FREELIST = 0x80000000; + }; + + + /////////////////////////// + // Block + /////////////////////////// + + struct Block + { + Block() + : next(nullptr), elementsCompletelyDequeued(0), freeListRefs(0), freeListNext(nullptr), shouldBeOnFreeList(false), dynamicallyAllocated(true) + { + } + + inline bool is_empty() const + { + if (compile_time_condition<BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD>::value) { + // Check flags + for (size_t i = 0; i < BLOCK_SIZE; ++i) { + if (!emptyFlags[i].load(std::memory_order_relaxed)) { + return false; + } + } + + // Aha, empty; make sure we have all other memory effects that happened before the empty flags were set + std::atomic_thread_fence(std::memory_order_acquire); + return true; + } + else { + // Check counter + if (elementsCompletelyDequeued.load(std::memory_order_relaxed) == BLOCK_SIZE) { + std::atomic_thread_fence(std::memory_order_acquire); + return true; + } + assert(elementsCompletelyDequeued.load(std::memory_order_relaxed) <= BLOCK_SIZE); + return false; + } + } + + // Returns true if the block is now empty (does not apply in explicit context) + inline bool set_empty(index_t i) + { + if (BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) { + // Set flag + assert(!emptyFlags[BLOCK_SIZE - 1 - static_cast<size_t>(i & static_cast<index_t>(BLOCK_SIZE - 1))].load(std::memory_order_relaxed)); + emptyFlags[BLOCK_SIZE - 1 - static_cast<size_t>(i & static_cast<index_t>(BLOCK_SIZE - 1))].store(true, std::memory_order_release); + return false; + } + else { + // Increment counter + auto prevVal = elementsCompletelyDequeued.fetch_add(1, std::memory_order_release); + assert(prevVal < BLOCK_SIZE); + return prevVal == BLOCK_SIZE - 1; + } + } + + // Sets multiple contiguous item statuses to 'empty' (assumes no wrapping and count > 0). + // Returns true if the block is now empty (does not apply in explicit context). 
+ inline bool set_many_empty(index_t i, size_t count) + { + if (compile_time_condition<BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD>::value) { + // Set flags + std::atomic_thread_fence(std::memory_order_release); + i = BLOCK_SIZE - 1 - static_cast<size_t>(i & static_cast<index_t>(BLOCK_SIZE - 1)) - count + 1; + for (size_t j = 0; j != count; ++j) { + assert(!emptyFlags[i + j].load(std::memory_order_relaxed)); + emptyFlags[i + j].store(true, std::memory_order_relaxed); + } + return false; + } + else { + // Increment counter + auto prevVal = elementsCompletelyDequeued.fetch_add(count, std::memory_order_release); + assert(prevVal + count <= BLOCK_SIZE); + return prevVal + count == BLOCK_SIZE; + } + } + + inline void set_all_empty() + { + if (BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) { + // Set all flags + for (size_t i = 0; i != BLOCK_SIZE; ++i) { + emptyFlags[i].store(true, std::memory_order_relaxed); + } + } + else { + // Reset counter + elementsCompletelyDequeued.store(BLOCK_SIZE, std::memory_order_relaxed); + } + } + + inline void reset_empty() + { + if (compile_time_condition<BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD>::value) { + // Reset flags + for (size_t i = 0; i != BLOCK_SIZE; ++i) { + emptyFlags[i].store(false, std::memory_order_relaxed); + } + } + else { + // Reset counter + elementsCompletelyDequeued.store(0, std::memory_order_relaxed); + } + } + + inline T* operator[](index_t idx) MOODYCAMEL_NOEXCEPT { return static_cast<T*>(static_cast<void*>(elements)) + static_cast<size_t>(idx & static_cast<index_t>(BLOCK_SIZE - 1)); } + inline T const* operator[](index_t idx) const MOODYCAMEL_NOEXCEPT { return static_cast<T const*>(static_cast<void const*>(elements)) + static_cast<size_t>(idx & static_cast<index_t>(BLOCK_SIZE - 1)); } + + private: + // IMPORTANT: This must be the first member in Block, so that if T depends on the alignment of + // addresses returned by malloc, that alignment will be preserved. Apparently clang actually + // generates code that uses this assumption for AVX instructions in some cases. Ideally, we + // should also align Block to the alignment of T in case it's higher than malloc's 16-byte + // alignment, but this is hard to do in a cross-platform way. Assert for this case: + static_assert(std::alignment_of<T>::value <= std::alignment_of<details::max_align_t>::value, "The queue does not support super-aligned types at this time"); + // Additionally, we need the alignment of Block itself to be a multiple of max_align_t since + // otherwise the appropriate padding will not be added at the end of Block in order to make + // arrays of Blocks all be properly aligned (not just the first one). We use a union to force + // this. + union { + char elements[sizeof(T) * BLOCK_SIZE]; + details::max_align_t dummy; + }; + public: + Block* next; + std::atomic<size_t> elementsCompletelyDequeued; + std::atomic<bool> emptyFlags[BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD ? 
BLOCK_SIZE : 1]; + public: + std::atomic<std::uint32_t> freeListRefs; + std::atomic<Block*> freeListNext; + std::atomic<bool> shouldBeOnFreeList; + bool dynamicallyAllocated; // Perhaps a better name for this would be 'isNotPartOfInitialBlockPool' + }; + static_assert(std::alignment_of<Block>::value >= std::alignment_of<details::max_align_t>::value, "Internal error: Blocks must be at least as aligned as the type they are wrapping"); + + + /////////////////////////// + // Producer base + /////////////////////////// + + struct ProducerBase : public details::ConcurrentQueueProducerTypelessBase + { + ProducerBase(ConcurrentQueue* parent_) : + tailIndex(0), + headIndex(0), + dequeueOptimisticCount(0), + dequeueOvercommit(0), + tailBlock(nullptr), + parent(parent_) + { + } + + virtual ~ProducerBase() { }; + + template<typename It> + inline size_t dequeue_bulk(It& itemFirst, size_t max) + { + return static_cast<ExplicitProducer*>(this)->dequeue_bulk(itemFirst, max); + } + + inline ProducerBase* next_prod() const { return static_cast<ProducerBase*>(next); } + + inline size_t size_approx() const + { + auto tail = tailIndex.load(std::memory_order_relaxed); + auto head = headIndex.load(std::memory_order_relaxed); + return details::circular_less_than(head, tail) ? static_cast<size_t>(tail - head) : 0; + } + + inline index_t getTail() const { return tailIndex.load(std::memory_order_relaxed); } + protected: + std::atomic<index_t> tailIndex; // Where to enqueue to next + std::atomic<index_t> headIndex; // Where to dequeue from next + + std::atomic<index_t> dequeueOptimisticCount; + std::atomic<index_t> dequeueOvercommit; + + Block* tailBlock; + + public: + ConcurrentQueue* parent; + }; + + + public: + /////////////////////////// + // Explicit queue + /////////////////////////// + struct ExplicitProducer : public ProducerBase + { + explicit ExplicitProducer(ConcurrentQueue* _parent) : + ProducerBase(_parent), + blockIndex(nullptr), + pr_blockIndexSlotsUsed(0), + pr_blockIndexSize(EXPLICIT_INITIAL_INDEX_SIZE >> 1), + pr_blockIndexFront(0), + pr_blockIndexEntries(nullptr), + pr_blockIndexRaw(nullptr) + { + size_t poolBasedIndexSize = details::ceil_to_pow_2(_parent->initialBlockPoolSize) >> 1; + if (poolBasedIndexSize > pr_blockIndexSize) { + pr_blockIndexSize = poolBasedIndexSize; + } + + new_block_index(0); // This creates an index with double the number of current entries, i.e. EXPLICIT_INITIAL_INDEX_SIZE + } + + ~ExplicitProducer() + { + // Destruct any elements not yet dequeued. + // Since we're in the destructor, we can assume all elements + // are either completely dequeued or completely not (no halfways). 
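+ // (This relies on the usual precondition that nothing else is enqueueing or dequeueing
+ // while the queue is being destroyed.)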
+ if (this->tailBlock != nullptr) { // Note this means there must be a block index too + // First find the block that's partially dequeued, if any + Block* halfDequeuedBlock = nullptr; + if ((this->headIndex.load(std::memory_order_relaxed) & static_cast<index_t>(BLOCK_SIZE - 1)) != 0) { + // The head's not on a block boundary, meaning a block somewhere is partially dequeued + // (or the head block is the tail block and was fully dequeued, but the head/tail are still not on a boundary) + size_t i = (pr_blockIndexFront - pr_blockIndexSlotsUsed) & (pr_blockIndexSize - 1); + while (details::circular_less_than<index_t>(pr_blockIndexEntries[i].base + BLOCK_SIZE, this->headIndex.load(std::memory_order_relaxed))) { + i = (i + 1) & (pr_blockIndexSize - 1); + } + assert(details::circular_less_than<index_t>(pr_blockIndexEntries[i].base, this->headIndex.load(std::memory_order_relaxed))); + halfDequeuedBlock = pr_blockIndexEntries[i].block; + } + + // Start at the head block (note the first line in the loop gives us the head from the tail on the first iteration) + auto block = this->tailBlock; + do { + block = block->next; + if (block->ConcurrentQueue::Block::is_empty()) { + continue; + } + + size_t i = 0; // Offset into block + if (block == halfDequeuedBlock) { + i = static_cast<size_t>(this->headIndex.load(std::memory_order_relaxed) & static_cast<index_t>(BLOCK_SIZE - 1)); + } + + // Walk through all the items in the block; if this is the tail block, we need to stop when we reach the tail index + auto lastValidIndex = (this->tailIndex.load(std::memory_order_relaxed) & static_cast<index_t>(BLOCK_SIZE - 1)) == 0 ? BLOCK_SIZE : static_cast<size_t>(this->tailIndex.load(std::memory_order_relaxed) & static_cast<index_t>(BLOCK_SIZE - 1)); + while (i != BLOCK_SIZE && (block != this->tailBlock || i != lastValidIndex)) { + (*block)[i++]->~T(); + } + } while (block != this->tailBlock); + } + + // Destroy all blocks that we own + if (this->tailBlock != nullptr) { + auto block = this->tailBlock; + do { + auto nextBlock = block->next; + if (block->dynamicallyAllocated) { + destroy(block); + } + else { + this->parent->add_block_to_free_list(block); + } + block = nextBlock; + } while (block != this->tailBlock); + } + + // Destroy the block indices + auto header = static_cast<BlockIndexHeader*>(pr_blockIndexRaw); + while (header != nullptr) { + auto prev = static_cast<BlockIndexHeader*>(header->prev); + header->~BlockIndexHeader(); + (Traits::free)(header); + header = prev; + } + } + + inline void enqueue_begin_alloc(index_t currentTailIndex) + { + // We reached the end of a block, start a new one + if (this->tailBlock != nullptr && this->tailBlock->next->ConcurrentQueue::Block::is_empty()) { + // We can re-use the block ahead of us, it's empty! + this->tailBlock = this->tailBlock->next; + this->tailBlock->ConcurrentQueue::Block::reset_empty(); + + // We'll put the block on the block index (guaranteed to be room since we're conceptually removing the + // last block from it first -- except instead of removing then adding, we can just overwrite). + // Note that there must be a valid block index here, since even if allocation failed in the ctor, + // it would have been re-attempted when adding the first block to the queue; since there is such + // a block, a block index must have been successfully allocated. 
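+ // (Both branches fall through to the shared code below, which records the new tail block
+ // in the block index.)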
+ } + else { + // We're going to need a new block; check that the block index has room + if (pr_blockIndexRaw == nullptr || pr_blockIndexSlotsUsed == pr_blockIndexSize) { + // Hmm, the circular block index is already full -- we'll need + // to allocate a new index. Note pr_blockIndexRaw can only be nullptr if + // the initial allocation failed in the constructor. + new_block_index(pr_blockIndexSlotsUsed); + } + + // Insert a new block in the circular linked list + auto newBlock = this->parent->ConcurrentQueue::requisition_block(); + newBlock->ConcurrentQueue::Block::reset_empty(); + if (this->tailBlock == nullptr) { + newBlock->next = newBlock; + } + else { + newBlock->next = this->tailBlock->next; + this->tailBlock->next = newBlock; + } + this->tailBlock = newBlock; + ++pr_blockIndexSlotsUsed; + } + + // Add block to block index + auto& entry = blockIndex.load(std::memory_order_relaxed)->entries[pr_blockIndexFront]; + entry.base = currentTailIndex; + entry.block = this->tailBlock; + blockIndex.load(std::memory_order_relaxed)->front.store(pr_blockIndexFront, std::memory_order_release); + pr_blockIndexFront = (pr_blockIndexFront + 1) & (pr_blockIndexSize - 1); + } + + tracy_force_inline T* enqueue_begin(index_t& currentTailIndex) + { + currentTailIndex = this->tailIndex.load(std::memory_order_relaxed); + if (details::cqUnlikely((currentTailIndex & static_cast<index_t>(BLOCK_SIZE - 1)) == 0)) { + this->enqueue_begin_alloc(currentTailIndex); + } + return (*this->tailBlock)[currentTailIndex]; + } + + tracy_force_inline std::atomic<index_t>& get_tail_index() + { + return this->tailIndex; + } + + template<typename It> + size_t dequeue_bulk(It& itemFirst, size_t max) + { + auto tail = this->tailIndex.load(std::memory_order_relaxed); + auto overcommit = this->dequeueOvercommit.load(std::memory_order_relaxed); + auto desiredCount = static_cast<size_t>(tail - (this->dequeueOptimisticCount.load(std::memory_order_relaxed) - overcommit)); + if (details::circular_less_than<size_t>(0, desiredCount)) { + desiredCount = desiredCount < max ? desiredCount : max; + std::atomic_thread_fence(std::memory_order_acquire); + + auto myDequeueCount = this->dequeueOptimisticCount.fetch_add(desiredCount, std::memory_order_relaxed); + assert(overcommit <= myDequeueCount); + + tail = this->tailIndex.load(std::memory_order_acquire); + auto actualCount = static_cast<size_t>(tail - (myDequeueCount - overcommit)); + if (details::circular_less_than<size_t>(0, actualCount)) { + actualCount = desiredCount < actualCount ? desiredCount : actualCount; + if (actualCount < desiredCount) { + this->dequeueOvercommit.fetch_add(desiredCount - actualCount, std::memory_order_release); + } + + // Get the first index. Note that since there's guaranteed to be at least actualCount elements, this + // will never exceed tail. 
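+ // headIndex is only advanced once we know actualCount elements are really available, so it
+ // can never overtake tailIndex; any overshoot was already repaid via dequeueOvercommit above.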
+ auto firstIndex = this->headIndex.fetch_add(actualCount, std::memory_order_acq_rel); + + // Determine which block the first element is in + auto localBlockIndex = blockIndex.load(std::memory_order_acquire); + auto localBlockIndexHead = localBlockIndex->front.load(std::memory_order_acquire); + + auto headBase = localBlockIndex->entries[localBlockIndexHead].base; + auto firstBlockBaseIndex = firstIndex & ~static_cast<index_t>(BLOCK_SIZE - 1); + auto offset = static_cast<size_t>(static_cast<typename std::make_signed<index_t>::type>(firstBlockBaseIndex - headBase) / BLOCK_SIZE); + auto indexIndex = (localBlockIndexHead + offset) & (localBlockIndex->size - 1); + + // Iterate the blocks and dequeue + auto index = firstIndex; + do { + auto firstIndexInBlock = index; + auto endIndex = (index & ~static_cast<index_t>(BLOCK_SIZE - 1)) + static_cast<index_t>(BLOCK_SIZE); + endIndex = details::circular_less_than<index_t>(firstIndex + static_cast<index_t>(actualCount), endIndex) ? firstIndex + static_cast<index_t>(actualCount) : endIndex; + auto block = localBlockIndex->entries[indexIndex].block; + + const auto sz = endIndex - index; + memcpy( itemFirst, (*block)[index], sizeof( T ) * sz ); + index += sz; + itemFirst += sz; + + block->ConcurrentQueue::Block::set_many_empty(firstIndexInBlock, static_cast<size_t>(endIndex - firstIndexInBlock)); + indexIndex = (indexIndex + 1) & (localBlockIndex->size - 1); + } while (index != firstIndex + actualCount); + + return actualCount; + } + else { + // Wasn't anything to dequeue after all; make the effective dequeue count eventually consistent + this->dequeueOvercommit.fetch_add(desiredCount, std::memory_order_release); + } + } + + return 0; + } + + private: + struct BlockIndexEntry + { + index_t base; + Block* block; + }; + + struct BlockIndexHeader + { + size_t size; + std::atomic<size_t> front; // Current slot (not next, like pr_blockIndexFront) + BlockIndexEntry* entries; + void* prev; + }; + + + bool new_block_index(size_t numberOfFilledSlotsToExpose) + { + auto prevBlockSizeMask = pr_blockIndexSize - 1; + + // Create the new block + pr_blockIndexSize <<= 1; + auto newRawPtr = static_cast<char*>((Traits::malloc)(sizeof(BlockIndexHeader) + std::alignment_of<BlockIndexEntry>::value - 1 + sizeof(BlockIndexEntry) * pr_blockIndexSize)); + if (newRawPtr == nullptr) { + pr_blockIndexSize >>= 1; // Reset to allow graceful retry + return false; + } + + auto newBlockIndexEntries = reinterpret_cast<BlockIndexEntry*>(details::align_for<BlockIndexEntry>(newRawPtr + sizeof(BlockIndexHeader))); + + // Copy in all the old indices, if any + size_t j = 0; + if (pr_blockIndexSlotsUsed != 0) { + auto i = (pr_blockIndexFront - pr_blockIndexSlotsUsed) & prevBlockSizeMask; + do { + newBlockIndexEntries[j++] = pr_blockIndexEntries[i]; + i = (i + 1) & prevBlockSizeMask; + } while (i != pr_blockIndexFront); + } + + // Update everything + auto header = new (newRawPtr) BlockIndexHeader; + header->size = pr_blockIndexSize; + header->front.store(numberOfFilledSlotsToExpose - 1, std::memory_order_relaxed); + header->entries = newBlockIndexEntries; + header->prev = pr_blockIndexRaw; // we link the new block to the old one so we can free it later + + pr_blockIndexFront = j; + pr_blockIndexEntries = newBlockIndexEntries; + pr_blockIndexRaw = newRawPtr; + blockIndex.store(header, std::memory_order_release); + + return true; + } + + private: + std::atomic<BlockIndexHeader*> blockIndex; + + // To be used by producer only -- consumer must use the ones in referenced by blockIndex + size_t 
pr_blockIndexSlotsUsed; + size_t pr_blockIndexSize; + size_t pr_blockIndexFront; // Next slot (not current) + BlockIndexEntry* pr_blockIndexEntries; + void* pr_blockIndexRaw; + }; + + ExplicitProducer* get_explicit_producer(producer_token_t const& token) + { + return static_cast<ExplicitProducer*>(token.producer); + } + + private: + + ////////////////////////////////// + // Block pool manipulation + ////////////////////////////////// + + void populate_initial_block_list(size_t blockCount) + { + initialBlockPoolSize = blockCount; + if (initialBlockPoolSize == 0) { + initialBlockPool = nullptr; + return; + } + + initialBlockPool = create_array<Block>(blockCount); + if (initialBlockPool == nullptr) { + initialBlockPoolSize = 0; + } + for (size_t i = 0; i < initialBlockPoolSize; ++i) { + initialBlockPool[i].dynamicallyAllocated = false; + } + } + + inline Block* try_get_block_from_initial_pool() + { + if (initialBlockPoolIndex.load(std::memory_order_relaxed) >= initialBlockPoolSize) { + return nullptr; + } + + auto index = initialBlockPoolIndex.fetch_add(1, std::memory_order_relaxed); + + return index < initialBlockPoolSize ? (initialBlockPool + index) : nullptr; + } + + inline void add_block_to_free_list(Block* block) + { + freeList.add(block); + } + + inline void add_blocks_to_free_list(Block* block) + { + while (block != nullptr) { + auto next = block->next; + add_block_to_free_list(block); + block = next; + } + } + + inline Block* try_get_block_from_free_list() + { + return freeList.try_get(); + } + + // Gets a free block from one of the memory pools, or allocates a new one (if applicable) + Block* requisition_block() + { + auto block = try_get_block_from_initial_pool(); + if (block != nullptr) { + return block; + } + + block = try_get_block_from_free_list(); + if (block != nullptr) { + return block; + } + + return create<Block>(); + } + + + ////////////////////////////////// + // Producer list manipulation + ////////////////////////////////// + + ProducerBase* recycle_or_create_producer() + { + bool recycled; + return recycle_or_create_producer(recycled); + } + + ProducerBase* recycle_or_create_producer(bool& recycled) + { + // Try to re-use one first + for (auto ptr = producerListTail.load(std::memory_order_acquire); ptr != nullptr; ptr = ptr->next_prod()) { + if (ptr->inactive.load(std::memory_order_relaxed)) { + if( ptr->size_approx() == 0 ) + { + bool expected = true; + if (ptr->inactive.compare_exchange_strong(expected, /* desired */ false, std::memory_order_acquire, std::memory_order_relaxed)) { + // We caught one! It's been marked as activated, the caller can have it + recycled = true; + return ptr; + } + } + } + } + + recycled = false; + return add_producer(static_cast<ProducerBase*>(create<ExplicitProducer>(this))); + } + + ProducerBase* add_producer(ProducerBase* producer) + { + // Handle failed memory allocation + if (producer == nullptr) { + return nullptr; + } + + producerCount.fetch_add(1, std::memory_order_relaxed); + + // Add it to the lock-free list + auto prevTail = producerListTail.load(std::memory_order_relaxed); + do { + producer->next = prevTail; + } while (!producerListTail.compare_exchange_weak(prevTail, producer, std::memory_order_release, std::memory_order_relaxed)); + + return producer; + } + + void reown_producers() + { + // After another instance is moved-into/swapped-with this one, all the + // producers we stole still think their parents are the other queue. + // So fix them up! 
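+ // Only the parent pointers need patching; the producers themselves and their blocks stay put.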
+ for (auto ptr = producerListTail.load(std::memory_order_relaxed); ptr != nullptr; ptr = ptr->next_prod()) { + ptr->parent = this; + } + } + + ////////////////////////////////// + // Utility functions + ////////////////////////////////// + + template<typename U> + static inline U* create_array(size_t count) + { + assert(count > 0); + return static_cast<U*>((Traits::malloc)(sizeof(U) * count)); + } + + template<typename U> + static inline void destroy_array(U* p, size_t count) + { + ((void)count); + if (p != nullptr) { + assert(count > 0); + (Traits::free)(p); + } + } + + template<typename U> + static inline U* create() + { + auto p = (Traits::malloc)(sizeof(U)); + return p != nullptr ? new (p) U : nullptr; + } + + template<typename U, typename A1> + static inline U* create(A1&& a1) + { + auto p = (Traits::malloc)(sizeof(U)); + return p != nullptr ? new (p) U(std::forward<A1>(a1)) : nullptr; + } + + template<typename U> + static inline void destroy(U* p) + { + if (p != nullptr) { + p->~U(); + } + (Traits::free)(p); + } + +private: + std::atomic<ProducerBase*> producerListTail; + std::atomic<std::uint32_t> producerCount; + + std::atomic<size_t> initialBlockPoolIndex; + Block* initialBlockPool; + size_t initialBlockPoolSize; + + FreeList<Block> freeList; + + std::atomic<std::uint32_t> nextExplicitConsumerId; + std::atomic<std::uint32_t> globalExplicitConsumerOffset; +}; + + +template<typename T, typename Traits> +ProducerToken::ProducerToken(ConcurrentQueue<T, Traits>& queue) + : producer(queue.recycle_or_create_producer()) +{ + if (producer != nullptr) { + producer->token = this; + producer->threadId = detail::GetThreadHandleImpl(); + } +} + +template<typename T, typename Traits> +ConsumerToken::ConsumerToken(ConcurrentQueue<T, Traits>& queue) + : itemsConsumedFromCurrent(0), currentProducer(nullptr), desiredProducer(nullptr) +{ + initialOffset = queue.nextExplicitConsumerId.fetch_add(1, std::memory_order_release); + lastKnownGlobalOffset = static_cast<std::uint32_t>(-1); +} + +template<typename T, typename Traits> +inline void swap(ConcurrentQueue<T, Traits>& a, ConcurrentQueue<T, Traits>& b) MOODYCAMEL_NOEXCEPT +{ + a.swap(b); +} + +inline void swap(ProducerToken& a, ProducerToken& b) MOODYCAMEL_NOEXCEPT +{ + a.swap(b); +} + +inline void swap(ConsumerToken& a, ConsumerToken& b) MOODYCAMEL_NOEXCEPT +{ + a.swap(b); +} + +} + +} /* namespace tracy */ + +#if defined(__GNUC__) +#pragma GCC diagnostic pop +#endif diff --git a/libs/tracy/client/tracy_rpmalloc.cpp b/libs/tracy/client/tracy_rpmalloc.cpp @@ -0,0 +1,2099 @@ +#ifdef TRACY_ENABLE + +/* rpmalloc.c - Memory allocator - Public Domain - 2016 Mattias Jansson / Rampant Pixels + * + * This library provides a cross-platform lock free thread caching malloc implementation in C11. + * The latest source code is always available at + * + * https://github.com/rampantpixels/rpmalloc + * + * This library is put in the public domain; you can redistribute it and/or modify it without any restrictions. + * + */ + +#include "tracy_rpmalloc.hpp" + +/// Build time configurable limits +#ifndef HEAP_ARRAY_SIZE +//! Size of heap hashmap +#define HEAP_ARRAY_SIZE 79 +#endif +#ifndef ENABLE_THREAD_CACHE +//! Enable per-thread cache +#define ENABLE_THREAD_CACHE 1 +#endif +#ifndef ENABLE_GLOBAL_CACHE +//! Enable global cache shared between all threads, requires thread cache +#define ENABLE_GLOBAL_CACHE 1 +#endif +#ifndef ENABLE_VALIDATE_ARGS +//! Enable validation of args to public entry points +#define ENABLE_VALIDATE_ARGS 0 +#endif +#ifndef ENABLE_STATISTICS +//! 
Enable statistics collection +#define ENABLE_STATISTICS 0 +#endif +#ifndef ENABLE_ASSERTS +//! Enable asserts +#define ENABLE_ASSERTS 0 +#endif +#ifndef ENABLE_PRELOAD +//! Support preloading +#define ENABLE_PRELOAD 0 +#endif +#ifndef ENABLE_GUARDS +//! Enable overwrite/underwrite guards +#define ENABLE_GUARDS 0 +#endif +#ifndef ENABLE_UNLIMITED_CACHE +//! Unlimited cache disables any cache limitations +#define ENABLE_UNLIMITED_CACHE 0 +#endif +#ifndef DEFAULT_SPAN_MAP_COUNT +//! Default number of spans to map in call to map more virtual memory +#define DEFAULT_SPAN_MAP_COUNT 16 +#endif +//! Minimum cache size to remain after a release to global cache +#define MIN_SPAN_CACHE_SIZE 64 +//! Minimum number of spans to transfer between thread and global cache +#define MIN_SPAN_CACHE_RELEASE 16 +//! Maximum cache size divisor (max cache size will be max allocation count divided by this divisor) +#define MAX_SPAN_CACHE_DIVISOR 4 +//! Minimum cache size to remain after a release to global cache, large spans +#define MIN_LARGE_SPAN_CACHE_SIZE 8 +//! Minimum number of spans to transfer between thread and global cache, large spans +#define MIN_LARGE_SPAN_CACHE_RELEASE 4 +//! Maximum cache size divisor, large spans (max cache size will be max allocation count divided by this divisor) +#define MAX_LARGE_SPAN_CACHE_DIVISOR 16 +//! Multiplier for global span cache limit (max cache size will be calculated like thread cache and multiplied with this) +#define MAX_GLOBAL_CACHE_MULTIPLIER 8 + +#if !ENABLE_THREAD_CACHE +# undef ENABLE_GLOBAL_CACHE +# define ENABLE_GLOBAL_CACHE 0 +# undef MIN_SPAN_CACHE_SIZE +# undef MIN_SPAN_CACHE_RELEASE +# undef MAX_SPAN_CACHE_DIVISOR +# undef MIN_LARGE_SPAN_CACHE_SIZE +# undef MIN_LARGE_SPAN_CACHE_RELEASE +# undef MAX_LARGE_SPAN_CACHE_DIVISOR +#endif +#if !ENABLE_GLOBAL_CACHE +# undef MAX_GLOBAL_CACHE_MULTIPLIER +#endif + +/// Platform and arch specifics +#ifdef _MSC_VER +# pragma warning( push ) +# pragma warning( disable : 4324 ) +# define ALIGNED_STRUCT(name, alignment) __declspec(align(alignment)) struct name +# define FORCEINLINE __forceinline +# define atomic_thread_fence_acquire() //_ReadWriteBarrier() +# define atomic_thread_fence_release() //_ReadWriteBarrier() +# if ENABLE_VALIDATE_ARGS +# include <Intsafe.h> +# endif +# include <intrin.h> +#else +# include <unistd.h> +# if defined(__APPLE__) && ENABLE_PRELOAD +# include <pthread.h> +# endif +# define ALIGNED_STRUCT(name, alignment) struct __attribute__((__aligned__(alignment))) name +# ifdef FORCEINLINE +# undef FORCEINLINE +# endif +# define FORCEINLINE inline __attribute__((__always_inline__)) +# ifdef __arm__ +# define atomic_thread_fence_acquire() __asm volatile("dmb ish" ::: "memory") +# define atomic_thread_fence_release() __asm volatile("dmb ishst" ::: "memory") +# else +# define atomic_thread_fence_acquire() //__asm volatile("" ::: "memory") +# define atomic_thread_fence_release() //__asm volatile("" ::: "memory") +# endif +#endif + +#if defined( __x86_64__ ) || defined( _M_AMD64 ) || defined( _M_X64 ) || defined( _AMD64_ ) || defined( __arm64__ ) || defined( __aarch64__ ) +# define ARCH_64BIT 1 +#else +# define ARCH_64BIT 0 +#endif + +#if defined( _WIN32 ) || defined( __WIN32__ ) || defined( _WIN64 ) +# define PLATFORM_WINDOWS 1 +# define PLATFORM_POSIX 0 +#else +# define PLATFORM_WINDOWS 0 +# define PLATFORM_POSIX 1 +#endif + +#include <stdint.h> +#include <string.h> + +#include <assert.h> + +#if ENABLE_GUARDS +# define MAGIC_GUARD 0xDEADBAAD +#endif + +namespace tracy +{ + +/// Atomic access 
abstraction +ALIGNED_STRUCT(atomic32_t, 4) { + volatile int32_t nonatomic; +}; +typedef struct atomic32_t atomic32_t; + +ALIGNED_STRUCT(atomic64_t, 8) { + volatile int64_t nonatomic; +}; +typedef struct atomic64_t atomic64_t; + +ALIGNED_STRUCT(atomicptr_t, 8) { + volatile void* nonatomic; +}; +typedef struct atomicptr_t atomicptr_t; + +static FORCEINLINE int32_t +atomic_load32(atomic32_t* src) { + return src->nonatomic; +} + +static FORCEINLINE void +atomic_store32(atomic32_t* dst, int32_t val) { + dst->nonatomic = val; +} + +static FORCEINLINE int32_t +atomic_incr32(atomic32_t* val) { +#ifdef _MSC_VER + int32_t old = (int32_t)_InterlockedExchangeAdd((volatile long*)&val->nonatomic, 1); + return (old + 1); +#else + return __sync_add_and_fetch(&val->nonatomic, 1); +#endif +} + +static FORCEINLINE int32_t +atomic_add32(atomic32_t* val, int32_t add) { +#ifdef _MSC_VER + int32_t old = (int32_t)_InterlockedExchangeAdd((volatile long*)&val->nonatomic, add); + return (old + add); +#else + return __sync_add_and_fetch(&val->nonatomic, add); +#endif +} + +static FORCEINLINE void* +atomic_load_ptr(atomicptr_t* src) { + return (void*)((uintptr_t)src->nonatomic); +} + +static FORCEINLINE void +atomic_store_ptr(atomicptr_t* dst, void* val) { + dst->nonatomic = val; +} + +static FORCEINLINE int +atomic_cas_ptr(atomicptr_t* dst, void* val, void* ref) { +#ifdef _MSC_VER +# if ARCH_64BIT + return (_InterlockedCompareExchange64((volatile long long*)&dst->nonatomic, + (long long)val, (long long)ref) == (long long)ref) ? 1 : 0; +# else + return (_InterlockedCompareExchange((volatile long*)&dst->nonatomic, + (long)val, (long)ref) == (long)ref) ? 1 : 0; +# endif +#else + return __sync_bool_compare_and_swap(&dst->nonatomic, ref, val); +#endif +} + +/// Preconfigured limits and sizes +//! Granularity of a small allocation block +#define SMALL_GRANULARITY 32 +//! Small granularity shift count +#define SMALL_GRANULARITY_SHIFT 5 +//! Number of small block size classes +#define SMALL_CLASS_COUNT 63 +//! Maximum size of a small block +#define SMALL_SIZE_LIMIT 2016 +//! Granularity of a medium allocation block +#define MEDIUM_GRANULARITY 512 +//! Medium granularity shift count +#define MEDIUM_GRANULARITY_SHIFT 9 +//! Number of medium block size classes +#define MEDIUM_CLASS_COUNT 60 +//! Total number of small + medium size classes +#define SIZE_CLASS_COUNT (SMALL_CLASS_COUNT + MEDIUM_CLASS_COUNT) +//! Number of large block size classes +#define LARGE_CLASS_COUNT 32 +//! Maximum size of a medium block +#define MEDIUM_SIZE_LIMIT (SMALL_SIZE_LIMIT + (MEDIUM_GRANULARITY * MEDIUM_CLASS_COUNT) - SPAN_HEADER_SIZE) +//! Maximum size of a large block +#define LARGE_SIZE_LIMIT ((LARGE_CLASS_COUNT * _memory_span_size) - SPAN_HEADER_SIZE) +//! Size of a span header +#define SPAN_HEADER_SIZE 32 + +#define pointer_offset(ptr, ofs) (void*)((char*)(ptr) + (ptrdiff_t)(ofs)) +#define pointer_diff(first, second) (ptrdiff_t)((const char*)(first) - (const char*)(second)) + +#if ARCH_64BIT +typedef int64_t offset_t; +#else +typedef int32_t offset_t; +#endif +typedef uint32_t count_t; + +#if ENABLE_VALIDATE_ARGS +//! Maximum allocation size to avoid integer overflow +#undef MAX_ALLOC_SIZE +#define MAX_ALLOC_SIZE (((size_t)-1) - _memory_span_size) +#endif + +/// Data types +//! A memory heap, per thread +typedef struct heap_t heap_t; +//! Span of memory pages +typedef struct span_t span_t; +//! Size class definition +typedef struct size_class_t size_class_t; +//! Span block bookkeeping +typedef struct span_block_t span_block_t; +//! 
Span list bookkeeping +typedef struct span_list_t span_list_t; +//! Span data union, usage depending on span state +typedef union span_data_t span_data_t; +//! Cache data +typedef struct span_counter_t span_counter_t; +//! Global cache +typedef struct global_cache_t global_cache_t; + +//! Flag indicating span is the first (master) span of a split superspan +#define SPAN_FLAG_MASTER 1 +//! Flag indicating span is a secondary (sub) span of a split superspan +#define SPAN_FLAG_SUBSPAN 2 + +//Alignment offset must match in both structures to keep the data when +//transitioning between being used for blocks and being part of a list +struct span_block_t { + //! Free list + uint16_t free_list; + //! First autolinked block + uint16_t first_autolink; + //! Free count + uint16_t free_count; + //! Alignment offset + uint16_t align_offset; +}; + +struct span_list_t { + //! List size + uint32_t size; + //! Unused in lists + uint16_t unused; + //! Alignment offset + uint16_t align_offset; +}; + +union span_data_t { + //! Span data when used as blocks + span_block_t block; + //! Span data when used in lists + span_list_t list; +}; + +//A span can either represent a single span of memory pages with size declared by span_map_count configuration variable, +//or a set of spans in a continuous region, a super span. Any reference to the term "span" usually refers to both a single +//span or a super span. A super span can further be diviced into multiple spans (or this, super spans), where the first +//(super)span is the master and subsequent (super)spans are subspans. The master span keeps track of how many subspans +//that are still alive and mapped in virtual memory, and once all subspans and master have been unmapped the entire +//superspan region is released and unmapped (on Windows for example, the entire superspan range has to be released +//in the same call to release the virtual memory range, but individual subranges can be decommitted individually +//to reduce physical memory use). +struct span_t { + //! Heap ID + atomic32_t heap_id; + //! Size class + uint16_t size_class; + // TODO: If we could store remainder part of flags as an atomic counter, the entire check + // if master is owned by calling heap could be simplified to an atomic dec from any thread + // since remainder of a split super span only ever decreases, never increases + //! Flags and counters + uint16_t flags; + //! Span data + span_data_t data; + //! Next span + span_t* next_span; + //! Previous span + span_t* prev_span; +}; +static_assert(sizeof(span_t) <= SPAN_HEADER_SIZE, "span size mismatch"); + +//Adaptive cache counter of a single superspan span count +struct span_counter_t { + //! Allocation high water mark + uint32_t max_allocations; + //! Current number of allocations + uint32_t current_allocations; + //! Cache limit + uint32_t cache_limit; +}; + +struct heap_t { + //! Heap ID + int32_t id; + //! Free count for each size class active span + span_block_t active_block[SIZE_CLASS_COUNT]; + //! Active span for each size class + span_t* active_span[SIZE_CLASS_COUNT]; + //! List of semi-used spans with free blocks for each size class (double linked list) + span_t* size_cache[SIZE_CLASS_COUNT]; +#if ENABLE_THREAD_CACHE + //! List of free spans (single linked list) + span_t* span_cache[LARGE_CLASS_COUNT]; + //! Allocation counters + span_counter_t span_counter[LARGE_CLASS_COUNT]; +#endif + //! Mapped but unused spans + span_t* span_reserve; + //! Master span for mapped but unused spans + span_t* span_reserve_master; + //! 
Number of mapped but unused spans + size_t spans_reserved; + //! Deferred deallocation + atomicptr_t defer_deallocate; + //! Deferred unmaps + atomicptr_t defer_unmap; + //! Next heap in id list + heap_t* next_heap; + //! Next heap in orphan list + heap_t* next_orphan; + //! Memory pages alignment offset + size_t align_offset; +#if ENABLE_STATISTICS + //! Number of bytes transitioned thread -> global + size_t thread_to_global; + //! Number of bytes transitioned global -> thread + size_t global_to_thread; +#endif +}; + +struct size_class_t { + //! Size of blocks in this class + uint32_t size; + //! Number of blocks in each chunk + uint16_t block_count; + //! Class index this class is merged with + uint16_t class_idx; +}; +static_assert(sizeof(size_class_t) == 8, "Size class size mismatch"); + +struct global_cache_t { + //! Cache list pointer + atomicptr_t cache; + //! Cache size + atomic32_t size; + //! ABA counter + atomic32_t counter; +}; + +/// Global data +//! Configuration +static rpmalloc_config_t _memory_config; +//! Memory page size +static size_t _memory_page_size; +//! Shift to divide by page size +static size_t _memory_page_size_shift; +//! Mask to get to start of a memory page +static size_t _memory_page_mask; +//! Granularity at which memory pages are mapped by OS +static size_t _memory_map_granularity; +//! Size of a span of memory pages +static size_t _memory_span_size; +//! Shift to divide by span size +static size_t _memory_span_size_shift; +//! Mask to get to start of a memory span +static uintptr_t _memory_span_mask; +//! Global size classes +static size_class_t _memory_size_class[SIZE_CLASS_COUNT]; +//! Run-time size limit of medium blocks +static size_t _memory_medium_size_limit; +//! Heap ID counter +static atomic32_t _memory_heap_id; +#if ENABLE_THREAD_CACHE +//! Adaptive cache max allocation count +static uint32_t _memory_max_allocation[LARGE_CLASS_COUNT]; +#endif +#if ENABLE_GLOBAL_CACHE +//! Global span cache +static global_cache_t _memory_span_cache[LARGE_CLASS_COUNT]; +#endif +//! All heaps +static atomicptr_t _memory_heaps[HEAP_ARRAY_SIZE]; +//! Orphaned heaps +static atomicptr_t _memory_orphan_heaps; +//! Running orphan counter to avoid ABA issues in linked list +static atomic32_t _memory_orphan_counter; +//! Active heap count +static atomic32_t _memory_active_heaps; +#if ENABLE_STATISTICS +//! Total number of currently mapped memory pages +static atomic32_t _mapped_pages; +//! Total number of currently lost spans +static atomic32_t _reserved_spans; +//! Running counter of total number of mapped memory pages since start +static atomic32_t _mapped_total; +//! Running counter of total number of unmapped memory pages since start +static atomic32_t _unmapped_total; +#endif + +#define MEMORY_UNUSED(x) (void)sizeof((x)) + +//! Current thread heap +#if defined(__APPLE__) && ENABLE_PRELOAD +static pthread_key_t _memory_thread_heap; +#else +# ifdef _MSC_VER +# define _Thread_local __declspec(thread) +# define TLS_MODEL +# else +# define TLS_MODEL __attribute__((tls_model("initial-exec"))) +# if !defined(__clang__) && defined(__GNUC__) +# define _Thread_local __thread +# endif +# endif +static _Thread_local heap_t* _memory_thread_heap TLS_MODEL; +#endif + +//! Get the current thread heap +static FORCEINLINE heap_t* +get_thread_heap(void) { +#if defined(__APPLE__) && ENABLE_PRELOAD + return pthread_getspecific(_memory_thread_heap); +#else + return _memory_thread_heap; +#endif +} + +//! 
Set the current thread heap +static void +set_thread_heap(heap_t* heap) { +#if defined(__APPLE__) && ENABLE_PRELOAD + pthread_setspecific(_memory_thread_heap, heap); +#else + _memory_thread_heap = heap; +#endif +} + +//! Default implementation to map more virtual memory +static void* +_memory_map_os(size_t size, size_t* offset); + +//! Default implementation to unmap virtual memory +static void +_memory_unmap_os(void* address, size_t size, size_t offset, int release); + +//! Deallocate any deferred blocks and check for the given size class +static int +_memory_deallocate_deferred(heap_t* heap, size_t size_class); + +//! Lookup a memory heap from heap ID +static heap_t* +_memory_heap_lookup(int32_t id) { + uint32_t list_idx = id % HEAP_ARRAY_SIZE; + heap_t* heap = (heap_t*)atomic_load_ptr(&_memory_heaps[list_idx]); + while (heap && (heap->id != id)) + heap = heap->next_heap; + return heap; +} + +#if ENABLE_THREAD_CACHE + +//! Increase an allocation counter +static void +_memory_counter_increase(span_counter_t* counter, uint32_t* global_counter, size_t span_count) { + if (++counter->current_allocations > counter->max_allocations) { + counter->max_allocations = counter->current_allocations; + const uint32_t cache_limit_max = (uint32_t)_memory_span_size - 2; +#if !ENABLE_UNLIMITED_CACHE + counter->cache_limit = counter->max_allocations / ((span_count == 1) ? MAX_SPAN_CACHE_DIVISOR : MAX_LARGE_SPAN_CACHE_DIVISOR); + const uint32_t cache_limit_min = (span_count == 1) ? (MIN_SPAN_CACHE_RELEASE + MIN_SPAN_CACHE_SIZE) : (MIN_LARGE_SPAN_CACHE_RELEASE + MIN_LARGE_SPAN_CACHE_SIZE); + if (counter->cache_limit < cache_limit_min) + counter->cache_limit = cache_limit_min; + if (counter->cache_limit > cache_limit_max) + counter->cache_limit = cache_limit_max; +#else + counter->cache_limit = cache_limit_max; +#endif + if (counter->max_allocations > *global_counter) + *global_counter = counter->max_allocations; + } +} + +#else +# define _memory_counter_increase(counter, global_counter, span_count) do {} while (0) +#endif + +#if ENABLE_STATISTICS +# define _memory_statistics_add(atomic_counter, value) atomic_add32(atomic_counter, (int32_t)(value)) +# define _memory_statistics_sub(atomic_counter, value) atomic_add32(atomic_counter, -(int32_t)(value)) +#else +# define _memory_statistics_add(atomic_counter, value) do {} while(0) +# define _memory_statistics_sub(atomic_counter, value) do {} while(0) +#endif + +//! Map more virtual memory +static void* +_memory_map(size_t size, size_t* offset) { + assert(!(size % _memory_page_size)); + _memory_statistics_add(&_mapped_pages, (size >> _memory_page_size_shift)); + _memory_statistics_add(&_mapped_total, (size >> _memory_page_size_shift)); + return _memory_config.memory_map(size, offset); +} + +//! Unmap virtual memory +static void +_memory_unmap(void* address, size_t size, size_t offset, int release) { + assert((size < _memory_span_size) || !((uintptr_t)address & ~_memory_span_mask)); + assert(!(size % _memory_page_size)); + _memory_statistics_sub(&_mapped_pages, (size >> _memory_page_size_shift)); + _memory_statistics_add(&_unmapped_total, (size >> _memory_page_size_shift)); + _memory_config.memory_unmap(address, size, offset, release); +} + +//! Make flags field in a span from flags, remainder/distance and count +#define SPAN_MAKE_FLAGS(flags, remdist, count) ((uint16_t)((flags) | ((uint16_t)((remdist) - 1) << 2) | ((uint16_t)((count) - 1) << 9))); assert((flags) < 4); assert((remdist) && (remdist) < 128); assert((count) && (count) < 128) +//! 
Check if span has any of the given flags +#define SPAN_HAS_FLAG(flags, flag) ((flags) & (flag)) +//! Get the distance from flags field +#define SPAN_DISTANCE(flags) (1 + (((flags) >> 2) & 0x7f)) +//! Get the remainder from flags field +#define SPAN_REMAINS(flags) (1 + (((flags) >> 2) & 0x7f)) +//! Get the count from flags field +#define SPAN_COUNT(flags) (1 + (((flags) >> 9) & 0x7f)) +//! Set the remainder in the flags field (MUST be done from the owner heap thread) +#define SPAN_SET_REMAINS(flags, remains) flags = ((uint16_t)(((flags) & 0xfe03) | ((uint16_t)((remains) - 1) << 2))); assert((remains) < 128) + +//! Resize the given super span to the given count of spans, store the remainder in the heap reserved spans fields +static void +_memory_set_span_remainder_as_reserved(heap_t* heap, span_t* span, size_t use_count) { + size_t current_count = SPAN_COUNT(span->flags); + + assert(!SPAN_HAS_FLAG(span->flags, SPAN_FLAG_MASTER) || !SPAN_HAS_FLAG(span->flags, SPAN_FLAG_SUBSPAN)); + assert((current_count > 1) && (current_count < 127)); + assert(!heap->spans_reserved); + assert((size_t)SPAN_COUNT(span->flags) == current_count); + assert(current_count > use_count); + + heap->span_reserve = (span_t*)pointer_offset(span, use_count * _memory_span_size); + heap->spans_reserved = current_count - use_count; + if (!SPAN_HAS_FLAG(span->flags, SPAN_FLAG_MASTER | SPAN_FLAG_SUBSPAN)) { + //We must store the heap id before setting as master, to force unmaps to defer to this heap thread + atomic_store32(&span->heap_id, heap->id); + atomic_thread_fence_release(); + heap->span_reserve_master = span; + span->flags = SPAN_MAKE_FLAGS(SPAN_FLAG_MASTER, current_count, use_count); + _memory_statistics_add(&_reserved_spans, current_count); + } + else if (SPAN_HAS_FLAG(span->flags, SPAN_FLAG_MASTER)) { + //Only owner heap thread can modify a master span + assert(atomic_load32(&span->heap_id) == heap->id); + uint16_t remains = SPAN_REMAINS(span->flags); + assert(remains >= current_count); + heap->span_reserve_master = span; + span->flags = SPAN_MAKE_FLAGS(SPAN_FLAG_MASTER, remains, use_count); + } + else { //SPAN_FLAG_SUBSPAN + //Resizing a subspan is a safe operation in any thread + uint16_t distance = SPAN_DISTANCE(span->flags); + span_t* master = (span_t*)pointer_offset(span, -(int)distance * (int)_memory_span_size); + heap->span_reserve_master = master; + assert(SPAN_HAS_FLAG(master->flags, SPAN_FLAG_MASTER)); + assert((size_t)SPAN_REMAINS(master->flags) >= current_count); + span->flags = SPAN_MAKE_FLAGS(SPAN_FLAG_SUBSPAN, distance, use_count); + } + assert((SPAN_COUNT(span->flags) + heap->spans_reserved) == current_count); +} + +//! Map in memory pages for the given number of spans (or use previously reserved pages) +static span_t* +_memory_map_spans(heap_t* heap, size_t span_count) { + if (span_count <= heap->spans_reserved) { + span_t* span = heap->span_reserve; + heap->span_reserve = (span_t*)pointer_offset(span, span_count * _memory_span_size); + heap->spans_reserved -= span_count; + //Declare the span to be a subspan with given distance from master span + uint16_t distance = (uint16_t)((uintptr_t)pointer_diff(span, heap->span_reserve_master) >> _memory_span_size_shift); + span->flags = SPAN_MAKE_FLAGS(SPAN_FLAG_SUBSPAN, distance, span_count); + span->data.block.align_offset = 0; + return span; + } + + //We cannot request extra spans if we already have some (but not enough) pending reserved spans + size_t request_spans = (heap->spans_reserved || (span_count > _memory_config.span_map_count)) ? 
span_count : _memory_config.span_map_count; + size_t align_offset = 0; + span_t* span = (span_t*)_memory_map(request_spans * _memory_span_size, &align_offset); + span->flags = SPAN_MAKE_FLAGS(0, request_spans, request_spans); + span->data.block.align_offset = (uint16_t)align_offset; + if (request_spans > span_count) { + //We have extra spans, store them as reserved spans in heap + _memory_set_span_remainder_as_reserved(heap, span, span_count); + } + return span; +} + +//! Defer unmapping of the given span to the owner heap +static int +_memory_unmap_defer(int32_t heap_id, span_t* span) { + //Get the heap and link in pointer in list of deferred operations + heap_t* heap = _memory_heap_lookup(heap_id); + if (!heap) + return 0; + atomic_store32(&span->heap_id, heap_id); + void* last_ptr; + do { + last_ptr = atomic_load_ptr(&heap->defer_unmap); + span->next_span = (span_t*)last_ptr; + } while (!atomic_cas_ptr(&heap->defer_unmap, span, last_ptr)); + return 1; +} + +//! Unmap memory pages for the given number of spans (or mark as unused if no partial unmappings) +static void +_memory_unmap_span(heap_t* heap, span_t* span) { + size_t span_count = SPAN_COUNT(span->flags); + assert(!SPAN_HAS_FLAG(span->flags, SPAN_FLAG_MASTER) || !SPAN_HAS_FLAG(span->flags, SPAN_FLAG_SUBSPAN)); + //A plain run of spans can be unmapped directly + if (!SPAN_HAS_FLAG(span->flags, SPAN_FLAG_MASTER | SPAN_FLAG_SUBSPAN)) { + _memory_unmap(span, span_count * _memory_span_size, span->data.list.align_offset, 1); + return; + } + + uint32_t is_master = SPAN_HAS_FLAG(span->flags, SPAN_FLAG_MASTER); + span_t* master = is_master ? span : (span_t*)(pointer_offset(span, -(int)SPAN_DISTANCE(span->flags) * (int)_memory_span_size)); + + assert(is_master || SPAN_HAS_FLAG(span->flags, SPAN_FLAG_SUBSPAN)); + assert(SPAN_HAS_FLAG(master->flags, SPAN_FLAG_MASTER)); + + //Check if we own the master span, otherwise defer (only owner of master span can modify remainder field) + int32_t master_heap_id = atomic_load32(&master->heap_id); + if (heap && (master_heap_id != heap->id)) { + if (_memory_unmap_defer(master_heap_id, span)) + return; + } + if (!is_master) { + //Directly unmap subspans + assert(span->data.list.align_offset == 0); + _memory_unmap(span, span_count * _memory_span_size, 0, 0); + _memory_statistics_sub(&_reserved_spans, span_count); + } + else { + //Special double flag to denote an unmapped master + //It must be kept in memory since span header must be used + span->flags |= SPAN_FLAG_MASTER | SPAN_FLAG_SUBSPAN; + } + //We are in owner thread of the master span + uint32_t remains = SPAN_REMAINS(master->flags); + assert(remains >= span_count); + remains = ((uint32_t)span_count >= remains) ? 0 : (remains - (uint32_t)span_count); + if (!remains) { + //Everything unmapped, unmap the master span with release flag to unmap the entire range of the super span + assert(SPAN_HAS_FLAG(master->flags, SPAN_FLAG_MASTER) && SPAN_HAS_FLAG(master->flags, SPAN_FLAG_SUBSPAN)); + span_count = SPAN_COUNT(master->flags); + _memory_unmap(master, span_count * _memory_span_size, master->data.list.align_offset, 1); + _memory_statistics_sub(&_reserved_spans, span_count); + } + else { + //Set remaining spans + SPAN_SET_REMAINS(master->flags, remains); + } +} + +//! 
Process pending deferred cross-thread unmaps +static span_t* +_memory_unmap_deferred(heap_t* heap, size_t wanted_count) { + //Grab the current list of deferred unmaps + atomic_thread_fence_acquire(); + span_t* span = (span_t*)atomic_load_ptr(&heap->defer_unmap); + if (!span || !atomic_cas_ptr(&heap->defer_unmap, 0, span)) + return 0; + span_t* found_span = 0; + do { + //Verify that we own the master span, otherwise re-defer to owner + void* next = span->next_span; + size_t span_count = SPAN_COUNT(span->flags); + if (!found_span && span_count == wanted_count) { + assert(!SPAN_HAS_FLAG(span->flags, SPAN_FLAG_MASTER) || !SPAN_HAS_FLAG(span->flags, SPAN_FLAG_SUBSPAN)); + found_span = span; + } + else { + uint32_t is_master = SPAN_HAS_FLAG(span->flags, SPAN_FLAG_MASTER); + span_t* master = is_master ? span : (span_t*)(pointer_offset(span, -(int)SPAN_DISTANCE(span->flags) * (int)_memory_span_size)); + int32_t master_heap_id = atomic_load32(&master->heap_id); + if ((atomic_load32(&span->heap_id) == master_heap_id) || + !_memory_unmap_defer(master_heap_id, span)) { + //We own the master span (or heap merged and abandoned) + _memory_unmap_span(heap, span); + } + } + span = (span_t*)next; + } while (span); + return found_span; +} + +//! Unmap a single linked list of spans +static void +_memory_unmap_span_list(heap_t* heap, span_t* span) { + size_t list_size = span->data.list.size; + for (size_t ispan = 0; ispan < list_size; ++ispan) { + span_t* next_span = span->next_span; + _memory_unmap_span(heap, span); + span = next_span; + } + assert(!span); +} + +#if ENABLE_THREAD_CACHE + +//! Split a super span in two +static span_t* +_memory_span_split(heap_t* heap, span_t* span, size_t use_count) { + uint16_t distance = 0; + size_t current_count = SPAN_COUNT(span->flags); + assert(current_count > use_count); + assert(!SPAN_HAS_FLAG(span->flags, SPAN_FLAG_MASTER) || !SPAN_HAS_FLAG(span->flags, SPAN_FLAG_SUBSPAN)); + if (!SPAN_HAS_FLAG(span->flags, SPAN_FLAG_MASTER | SPAN_FLAG_SUBSPAN)) { + //Must store heap in master span before use, to avoid issues when unmapping subspans + atomic_store32(&span->heap_id, heap->id); + atomic_thread_fence_release(); + span->flags = SPAN_MAKE_FLAGS(SPAN_FLAG_MASTER, current_count, use_count); + _memory_statistics_add(&_reserved_spans, current_count); + } + else if (SPAN_HAS_FLAG(span->flags, SPAN_FLAG_MASTER)) { + //Only valid to call on master span if we own it + assert(atomic_load32(&span->heap_id) == heap->id); + uint16_t remains = SPAN_REMAINS(span->flags); + assert(remains >= current_count); + span->flags = SPAN_MAKE_FLAGS(SPAN_FLAG_MASTER, remains, use_count); + } + else { //SPAN_FLAG_SUBSPAN + distance = SPAN_DISTANCE(span->flags); + span->flags = SPAN_MAKE_FLAGS(SPAN_FLAG_SUBSPAN, distance, use_count); + } + //Setup remainder as a subspan + span_t* subspan = (span_t*)pointer_offset(span, use_count * _memory_span_size); + subspan->flags = SPAN_MAKE_FLAGS(SPAN_FLAG_SUBSPAN, distance + use_count, current_count - use_count); + subspan->data.list.align_offset = 0; + return subspan; +} + +//! Add span to head of single linked span list +static size_t +_memory_span_list_push(span_t** head, span_t* span) { + span->next_span = *head; + if (*head) + span->data.list.size = (*head)->data.list.size + 1; + else + span->data.list.size = 1; + *head = span; + return span->data.list.size; +} + +//! 
Remove span from head of single linked span list, returns the new list head +static span_t* +_memory_span_list_pop(span_t** head) { + span_t* span = *head; + span_t* next_span = 0; + if (span->data.list.size > 1) { + next_span = span->next_span; + assert(next_span); + next_span->data.list.size = span->data.list.size - 1; + } + *head = next_span; + return span; +} + +//! Split a single linked span list +static span_t* +_memory_span_list_split(span_t* span, size_t limit) { + span_t* next = 0; + if (limit < 2) + limit = 2; + if (span->data.list.size > limit) { + count_t list_size = 1; + span_t* last = span; + next = span->next_span; + while (list_size < limit) { + last = next; + next = next->next_span; + ++list_size; + } + last->next_span = 0; + assert(next); + next->data.list.size = span->data.list.size - list_size; + span->data.list.size = list_size; + span->prev_span = 0; + } + return next; +} + +#endif + +//! Add a span to a double linked list +static void +_memory_span_list_doublelink_add(span_t** head, span_t* span) { + if (*head) { + (*head)->prev_span = span; + span->next_span = *head; + } + else { + span->next_span = 0; + } + *head = span; +} + +//! Remove a span from a double linked list +static void +_memory_span_list_doublelink_remove(span_t** head, span_t* span) { + if (*head == span) { + *head = span->next_span; + } + else { + span_t* next_span = span->next_span; + span_t* prev_span = span->prev_span; + if (next_span) + next_span->prev_span = prev_span; + prev_span->next_span = next_span; + } +} + +#if ENABLE_GLOBAL_CACHE + +//! Insert the given list of memory page spans in the global cache +static void +_memory_cache_insert(heap_t* heap, global_cache_t* cache, span_t* span, size_t cache_limit) { + assert((span->data.list.size == 1) || (span->next_span != 0)); + int32_t list_size = (int32_t)span->data.list.size; + //Unmap if cache has reached the limit + if (atomic_add32(&cache->size, list_size) > (int32_t)cache_limit) { + _memory_unmap_span_list(heap, span); + atomic_add32(&cache->size, -list_size); + return; + } + void* current_cache, *new_cache; + do { + current_cache = atomic_load_ptr(&cache->cache); + span->prev_span = (span_t*)(void*)((uintptr_t)current_cache & _memory_span_mask); + new_cache = (void*)((uintptr_t)span | ((uintptr_t)atomic_incr32(&cache->counter) & ~_memory_span_mask)); + } while (!atomic_cas_ptr(&cache->cache, new_cache, current_cache)); +} + +//! Extract a number of memory page spans from the global cache +static span_t* +_memory_cache_extract(global_cache_t* cache) { + uintptr_t span_ptr; + do { + void* global_span = atomic_load_ptr(&cache->cache); + span_ptr = (uintptr_t)global_span & _memory_span_mask; + if (span_ptr) { + span_t* span = (span_t*)(void*)span_ptr; + //By accessing the span ptr before it is swapped out of list we assume that a contending thread + //does not manage to traverse the span to being unmapped before we access it + void* new_cache = (void*)((uintptr_t)span->prev_span | ((uintptr_t)atomic_incr32(&cache->counter) & ~_memory_span_mask)); + if (atomic_cas_ptr(&cache->cache, new_cache, global_span)) { + atomic_add32(&cache->size, -(int32_t)span->data.list.size); + return span; + } + } + } while (span_ptr); + return 0; +} + +//! 
Finalize a global cache, only valid from allocator finalization (not thread safe) +static void +_memory_cache_finalize(global_cache_t* cache) { + void* current_cache = atomic_load_ptr(&cache->cache); + span_t* span = (span_t*)(void*)((uintptr_t)current_cache & _memory_span_mask); + while (span) { + span_t* skip_span = (span_t*)(void*)((uintptr_t)span->prev_span & _memory_span_mask); + atomic_add32(&cache->size, -(int32_t)span->data.list.size); + _memory_unmap_span_list(0, span); + span = skip_span; + } + assert(!atomic_load32(&cache->size)); + atomic_store_ptr(&cache->cache, 0); + atomic_store32(&cache->size, 0); +} + +//! Insert the given list of memory page spans in the global cache +static void +_memory_global_cache_insert(heap_t* heap, span_t* span) { + //Calculate adaptive limits + size_t span_count = SPAN_COUNT(span->flags); + const size_t cache_divisor = (span_count == 1) ? MAX_SPAN_CACHE_DIVISOR : (MAX_LARGE_SPAN_CACHE_DIVISOR * span_count * 2); + const size_t cache_limit = (MAX_GLOBAL_CACHE_MULTIPLIER * _memory_max_allocation[span_count - 1]) / cache_divisor; + const size_t cache_limit_min = MAX_GLOBAL_CACHE_MULTIPLIER * (span_count == 1 ? MIN_SPAN_CACHE_SIZE : MIN_LARGE_SPAN_CACHE_SIZE); + _memory_cache_insert(heap, &_memory_span_cache[span_count - 1], span, cache_limit > cache_limit_min ? cache_limit : cache_limit_min); +} + +//! Extract a number of memory page spans from the global cache for large blocks +static span_t* +_memory_global_cache_extract(size_t span_count) { + span_t* span = _memory_cache_extract(&_memory_span_cache[span_count - 1]); + assert(!span || ((size_t)SPAN_COUNT(span->flags) == span_count)); + return span; +} + +#endif + +//! Insert a single span into thread heap cache, releasing to global cache if overflow +static void +_memory_heap_cache_insert(heap_t* heap, span_t* span) { +#if ENABLE_THREAD_CACHE + size_t span_count = SPAN_COUNT(span->flags); + size_t idx = span_count - 1; + if (_memory_span_list_push(&heap->span_cache[idx], span) <= heap->span_counter[idx].cache_limit) + return; + heap->span_cache[idx] = _memory_span_list_split(span, heap->span_counter[idx].cache_limit); + assert(span->data.list.size == heap->span_counter[idx].cache_limit); +#if ENABLE_STATISTICS + heap->thread_to_global += (size_t)span->data.list.size * span_count * _memory_span_size; +#endif +#if ENABLE_GLOBAL_CACHE + _memory_global_cache_insert(heap, span); +#else + _memory_unmap_span_list(heap, span); +#endif +#else + _memory_unmap_span(heap, span); +#endif +} + +//! 
Extract the given number of spans from the different cache levels +static span_t* +_memory_heap_cache_extract(heap_t* heap, size_t span_count) { +#if ENABLE_THREAD_CACHE + size_t idx = span_count - 1; + //Step 1: check thread cache + if (heap->span_cache[idx]) + return _memory_span_list_pop(&heap->span_cache[idx]); +#endif + //Step 2: Check reserved spans + if (heap->spans_reserved >= span_count) + return _memory_map_spans(heap, span_count); + //Step 3: Try processing deferred unmappings + span_t* span = _memory_unmap_deferred(heap, span_count); + if (span) + return span; +#if ENABLE_THREAD_CACHE + //Step 4: Check larger super spans and split if we find one + for (++idx; idx < LARGE_CLASS_COUNT; ++idx) { + if (heap->span_cache[idx]) { + span = _memory_span_list_pop(&heap->span_cache[idx]); + break; + } + } + if (span) { + //Mark the span as owned by this heap before splitting + size_t got_count = SPAN_COUNT(span->flags); + assert(got_count > span_count); + atomic_store32(&span->heap_id, heap->id); + atomic_thread_fence_release(); + + //Split the span and store as reserved if no previously reserved spans, or in thread cache otherwise + span_t* subspan = _memory_span_split(heap, span, span_count); + assert((size_t)(SPAN_COUNT(span->flags) + SPAN_COUNT(subspan->flags)) == got_count); + assert((size_t)SPAN_COUNT(span->flags) == span_count); + if (!heap->spans_reserved) { + heap->spans_reserved = got_count - span_count; + heap->span_reserve = subspan; + heap->span_reserve_master = (span_t*)pointer_offset(subspan, -(int32_t)SPAN_DISTANCE(subspan->flags) * (int32_t)_memory_span_size); + } + else { + _memory_heap_cache_insert(heap, subspan); + } + return span; + } +#if ENABLE_GLOBAL_CACHE + //Step 5: Extract from global cache + idx = span_count - 1; + heap->span_cache[idx] = _memory_global_cache_extract(span_count); + if (heap->span_cache[idx]) { +#if ENABLE_STATISTICS + heap->global_to_thread += (size_t)heap->span_cache[idx]->data.list.size * span_count * _memory_span_size; +#endif + return _memory_span_list_pop(&heap->span_cache[idx]); + } +#endif +#endif + return 0; +} + +//! Allocate a small/medium sized memory block from the given heap +static void* +_memory_allocate_from_heap(heap_t* heap, size_t size) { + //Calculate the size class index and do a dependent lookup of the final class index (in case of merged classes) + const size_t base_idx = (size <= SMALL_SIZE_LIMIT) ? + ((size + (SMALL_GRANULARITY - 1)) >> SMALL_GRANULARITY_SHIFT) : + SMALL_CLASS_COUNT + ((size - SMALL_SIZE_LIMIT + (MEDIUM_GRANULARITY - 1)) >> MEDIUM_GRANULARITY_SHIFT); + assert(!base_idx || ((base_idx - 1) < SIZE_CLASS_COUNT)); + const size_t class_idx = _memory_size_class[base_idx ? (base_idx - 1) : 0].class_idx; + + span_block_t* active_block = heap->active_block + class_idx; + size_class_t* size_class = _memory_size_class + class_idx; + const count_t class_size = size_class->size; + + //Step 1: Try to get a block from the currently active span. 
The span block bookkeeping + // data for the active span is stored in the heap for faster access +use_active: + if (active_block->free_count) { + //Happy path, we have a span with at least one free block + span_t* span = heap->active_span[class_idx]; + count_t offset = class_size * active_block->free_list; + uint32_t* block = (uint32_t*)pointer_offset(span, SPAN_HEADER_SIZE + offset); + assert(span); + + --active_block->free_count; + if (!active_block->free_count) { + //Span is now completely allocated, set the bookkeeping data in the + //span itself and reset the active span pointer in the heap + span->data.block.free_count = 0; + span->data.block.first_autolink = (uint16_t)size_class->block_count; + heap->active_span[class_idx] = 0; + } + else { + //Get the next free block, either from linked list or from auto link + if (active_block->free_list < active_block->first_autolink) { + active_block->free_list = (uint16_t)(*block); + } + else { + ++active_block->free_list; + ++active_block->first_autolink; + } + assert(active_block->free_list < size_class->block_count); + } + + return block; + } + + //Step 2: No active span, try executing deferred deallocations and try again if there + // was at least one of the requested size class + if (_memory_deallocate_deferred(heap, class_idx)) { + if (active_block->free_count) + goto use_active; + } + + //Step 3: Check if there is a semi-used span of the requested size class available + if (heap->size_cache[class_idx]) { + //Promote a pending semi-used span to be active, storing bookkeeping data in + //the heap structure for faster access + span_t* span = heap->size_cache[class_idx]; + *active_block = span->data.block; + assert(active_block->free_count > 0); + heap->size_cache[class_idx] = span->next_span; + heap->active_span[class_idx] = span; + + //Mark span as owned by this heap + atomic_store32(&span->heap_id, heap->id); + atomic_thread_fence_release(); + + goto use_active; + } + + //Step 4: Find a span in one of the cache levels + span_t* span = _memory_heap_cache_extract(heap, 1); + if (!span) { + //Step 5: Map in more virtual memory + span = _memory_map_spans(heap, 1); + } + + //Mark span as owned by this heap and set base data + assert(SPAN_COUNT(span->flags) == 1); + span->size_class = (uint16_t)class_idx; + atomic_store32(&span->heap_id, heap->id); + atomic_thread_fence_release(); + + //If we only have one block we will grab it, otherwise + //set span as new span to use for next allocation + if (size_class->block_count > 1) { + //Reset block order to sequential auto linked order + active_block->free_count = (uint16_t)(size_class->block_count - 1); + active_block->free_list = 1; + active_block->first_autolink = 1; + heap->active_span[class_idx] = span; + } + else { + span->data.block.free_count = 0; + span->data.block.first_autolink = (uint16_t)size_class->block_count; + } + + //Track counters + _memory_counter_increase(&heap->span_counter[0], &_memory_max_allocation[0], 1); + + //Return first block if memory page span + return pointer_offset(span, SPAN_HEADER_SIZE); +} + +//! 
Allocate a large sized memory block from the given heap +static void* +_memory_allocate_large_from_heap(heap_t* heap, size_t size) { + //Calculate number of needed max sized spans (including header) + //Since this function is never called if size > LARGE_SIZE_LIMIT + //the span_count is guaranteed to be <= LARGE_CLASS_COUNT + size += SPAN_HEADER_SIZE; + size_t span_count = size >> _memory_span_size_shift; + if (size & (_memory_span_size - 1)) + ++span_count; + size_t idx = span_count - 1; + +#if ENABLE_THREAD_CACHE + if (!heap->span_cache[idx]) + _memory_deallocate_deferred(heap, SIZE_CLASS_COUNT + idx); +#else + _memory_deallocate_deferred(heap, SIZE_CLASS_COUNT + idx); +#endif + //Step 1: Find span in one of the cache levels + span_t* span = _memory_heap_cache_extract(heap, span_count); + if (!span) { + //Step 2: Map in more virtual memory + span = _memory_map_spans(heap, span_count); + } + + //Mark span as owned by this heap and set base data + assert((size_t)SPAN_COUNT(span->flags) == span_count); + span->size_class = (uint16_t)(SIZE_CLASS_COUNT + idx); + atomic_store32(&span->heap_id, heap->id); + atomic_thread_fence_release(); + + //Increase counter + _memory_counter_increase(&heap->span_counter[idx], &_memory_max_allocation[idx], span_count); + + return pointer_offset(span, SPAN_HEADER_SIZE); +} + +//! Allocate a new heap +static heap_t* +_memory_allocate_heap(void) { + void* raw_heap; + void* next_raw_heap; + uintptr_t orphan_counter; + heap_t* heap; + heap_t* next_heap; + //Try getting an orphaned heap + atomic_thread_fence_acquire(); + do { + raw_heap = atomic_load_ptr(&_memory_orphan_heaps); + heap = (heap_t*)(void*)((uintptr_t)raw_heap & _memory_page_mask); + if (!heap) + break; + next_heap = heap->next_orphan; + orphan_counter = (uintptr_t)atomic_incr32(&_memory_orphan_counter); + next_raw_heap = (void*)((uintptr_t)next_heap | (orphan_counter & ~_memory_page_mask)); + } + while (!atomic_cas_ptr(&_memory_orphan_heaps, next_raw_heap, raw_heap)); + + if (!heap) { + //Map in pages for a new heap + size_t align_offset = 0; + heap = (heap_t*)_memory_map((1 + (sizeof(heap_t) >> _memory_page_size_shift)) * _memory_page_size, &align_offset); + memset(heap, 0, sizeof(heap_t)); + heap->align_offset = align_offset; + + //Get a new heap ID + do { + heap->id = atomic_incr32(&_memory_heap_id); + if (_memory_heap_lookup(heap->id)) + heap->id = 0; + } while (!heap->id); + + //Link in heap in heap ID map + size_t list_idx = heap->id % HEAP_ARRAY_SIZE; + do { + next_heap = (heap_t*)atomic_load_ptr(&_memory_heaps[list_idx]); + heap->next_heap = next_heap; + } while (!atomic_cas_ptr(&_memory_heaps[list_idx], heap, next_heap)); + } + +#if ENABLE_THREAD_CACHE + heap->span_counter[0].cache_limit = MIN_SPAN_CACHE_RELEASE + MIN_SPAN_CACHE_SIZE; + for (size_t idx = 1; idx < LARGE_CLASS_COUNT; ++idx) + heap->span_counter[idx].cache_limit = MIN_LARGE_SPAN_CACHE_RELEASE + MIN_LARGE_SPAN_CACHE_SIZE; +#endif + + //Clean up any deferred operations + _memory_unmap_deferred(heap, 0); + _memory_deallocate_deferred(heap, 0); + + return heap; +} + +//! 
Deallocate the given small/medium memory block from the given heap +static void +_memory_deallocate_to_heap(heap_t* heap, span_t* span, void* p) { + //Check if span is the currently active span in order to operate + //on the correct bookkeeping data + assert(SPAN_COUNT(span->flags) == 1); + const count_t class_idx = span->size_class; + size_class_t* size_class = _memory_size_class + class_idx; + int is_active = (heap->active_span[class_idx] == span); + span_block_t* block_data = is_active ? + heap->active_block + class_idx : + &span->data.block; + + //Check if the span will become completely free + if (block_data->free_count == ((count_t)size_class->block_count - 1)) { +#if ENABLE_THREAD_CACHE + //Track counters + assert(heap->span_counter[0].current_allocations > 0); + if (heap->span_counter[0].current_allocations) + --heap->span_counter[0].current_allocations; +#endif + + //If it was active, reset counter. Otherwise, if not active, remove from + //partial free list if we had a previous free block (guard for classes with only 1 block) + if (is_active) + block_data->free_count = 0; + else if (block_data->free_count > 0) + _memory_span_list_doublelink_remove(&heap->size_cache[class_idx], span); + + //Add to heap span cache + _memory_heap_cache_insert(heap, span); + return; + } + + //Check if first free block for this span (previously fully allocated) + if (block_data->free_count == 0) { + //add to free list and disable autolink + _memory_span_list_doublelink_add(&heap->size_cache[class_idx], span); + block_data->first_autolink = (uint16_t)size_class->block_count; + } + ++block_data->free_count; + //Span is not yet completely free, so add block to the linked list of free blocks + void* blocks_start = pointer_offset(span, SPAN_HEADER_SIZE); + count_t block_offset = (count_t)pointer_diff(p, blocks_start); + count_t block_idx = block_offset / (count_t)size_class->size; + uint32_t* block = (uint32_t*)pointer_offset(blocks_start, block_idx * size_class->size); + *block = block_data->free_list; + block_data->free_list = (uint16_t)block_idx; +} + +//! Deallocate the given large memory block from the given heap +static void +_memory_deallocate_large_to_heap(heap_t* heap, span_t* span) { + //Decrease counter + size_t idx = (size_t)span->size_class - SIZE_CLASS_COUNT; + size_t span_count = idx + 1; + assert((size_t)SPAN_COUNT(span->flags) == span_count); + assert(span->size_class >= SIZE_CLASS_COUNT); + assert(idx < LARGE_CLASS_COUNT); +#if ENABLE_THREAD_CACHE + assert(heap->span_counter[idx].current_allocations > 0); + if (heap->span_counter[idx].current_allocations) + --heap->span_counter[idx].current_allocations; +#endif + if (!heap->spans_reserved && (span_count > 1)) { + //Split the span and store remainder as reserved spans + //Must split to a dummy 1-span master since we cannot have master spans as reserved + _memory_set_span_remainder_as_reserved(heap, span, 1); + span_count = 1; + } + + //Insert into cache list + _memory_heap_cache_insert(heap, span); +} + +//! 
Process pending deferred cross-thread deallocations +static int +_memory_deallocate_deferred(heap_t* heap, size_t size_class) { + //Grab the current list of deferred deallocations + atomic_thread_fence_acquire(); + void* p = atomic_load_ptr(&heap->defer_deallocate); + if (!p || !atomic_cas_ptr(&heap->defer_deallocate, 0, p)) + return 0; + //Keep track if we deallocate in the given size class + int got_class = 0; + do { + void* next = *(void**)p; + //Get span and check which type of block + span_t* span = (span_t*)(void*)((uintptr_t)p & _memory_span_mask); + if (span->size_class < SIZE_CLASS_COUNT) { + //Small/medium block + got_class |= (span->size_class == size_class); + _memory_deallocate_to_heap(heap, span, p); + } + else { + //Large block + got_class |= ((span->size_class >= size_class) && (span->size_class <= (size_class + 2))); + _memory_deallocate_large_to_heap(heap, span); + } + //Loop until all pending operations in list are processed + p = next; + } while (p); + return got_class; +} + +//! Defer deallocation of the given block to the given heap +static void +_memory_deallocate_defer(int32_t heap_id, void* p) { + //Get the heap and link in pointer in list of deferred operations + heap_t* heap = _memory_heap_lookup(heap_id); + if (!heap) + return; + void* last_ptr; + do { + last_ptr = atomic_load_ptr(&heap->defer_deallocate); + *(void**)p = last_ptr; //Safe to use block, it's being deallocated + } while (!atomic_cas_ptr(&heap->defer_deallocate, p, last_ptr)); +} + +//! Allocate a block of the given size +static void* +_memory_allocate(size_t size) { + if (size <= _memory_medium_size_limit) + return _memory_allocate_from_heap(get_thread_heap(), size); + else if (size <= LARGE_SIZE_LIMIT) + return _memory_allocate_large_from_heap(get_thread_heap(), size); + + //Oversized, allocate pages directly + size += SPAN_HEADER_SIZE; + size_t num_pages = size >> _memory_page_size_shift; + if (size & (_memory_page_size - 1)) + ++num_pages; + size_t align_offset = 0; + span_t* span = (span_t*)_memory_map(num_pages * _memory_page_size, &align_offset); + atomic_store32(&span->heap_id, 0); + //Store page count in next_span + span->next_span = (span_t*)((uintptr_t)num_pages); + span->data.list.align_offset = (uint16_t)align_offset; + + return pointer_offset(span, SPAN_HEADER_SIZE); +} + +//! Deallocate the given block +static void +_memory_deallocate(void* p) { + if (!p) + return; + + //Grab the span (always at start of span, using 64KiB alignment) + span_t* span = (span_t*)(void*)((uintptr_t)p & _memory_span_mask); + int32_t heap_id = atomic_load32(&span->heap_id); + heap_t* heap = get_thread_heap(); + //Check if block belongs to this heap or if deallocation should be deferred + if (heap_id == heap->id) { + if (span->size_class < SIZE_CLASS_COUNT) + _memory_deallocate_to_heap(heap, span, p); + else + _memory_deallocate_large_to_heap(heap, span); + } + else if (heap_id > 0) { + _memory_deallocate_defer(heap_id, p); + } + else { + //Oversized allocation, page count is stored in next_span + size_t num_pages = (size_t)span->next_span; + _memory_unmap(span, num_pages * _memory_page_size, span->data.list.align_offset, 1); + } +} + +//! 
Reallocate the given block to the given size +static void* +_memory_reallocate(void* p, size_t size, size_t oldsize, unsigned int flags) { + if (p) { + //Grab the span using guaranteed span alignment + span_t* span = (span_t*)(void*)((uintptr_t)p & _memory_span_mask); + int32_t heap_id = atomic_load32(&span->heap_id); + if (heap_id) { + if (span->size_class < SIZE_CLASS_COUNT) { + //Small/medium sized block + size_class_t* size_class = _memory_size_class + span->size_class; + if ((size_t)size_class->size >= size) + return p; //Still fits in block, never mind trying to save memory + if (!oldsize) + oldsize = size_class->size; + } + else { + //Large block + size_t total_size = size + SPAN_HEADER_SIZE; + size_t num_spans = total_size >> _memory_span_size_shift; + if (total_size & (_memory_span_mask - 1)) + ++num_spans; + size_t current_spans = (span->size_class - SIZE_CLASS_COUNT) + 1; + if ((current_spans >= num_spans) && (num_spans >= (current_spans / 2))) + return p; //Still fits and less than half of memory would be freed + if (!oldsize) + oldsize = (current_spans * _memory_span_size) - SPAN_HEADER_SIZE; + } + } + else { + //Oversized block + size_t total_size = size + SPAN_HEADER_SIZE; + size_t num_pages = total_size >> _memory_page_size_shift; + if (total_size & (_memory_page_size - 1)) + ++num_pages; + //Page count is stored in next_span + size_t current_pages = (size_t)span->next_span; + if ((current_pages >= num_pages) && (num_pages >= (current_pages / 2))) + return p; //Still fits and less than half of memory would be freed + if (!oldsize) + oldsize = (current_pages * _memory_page_size) - SPAN_HEADER_SIZE; + } + } + + //Size is greater than block size, need to allocate a new block and deallocate the old + //Avoid hysteresis by overallocating if increase is small (below 37%) + size_t lower_bound = oldsize + (oldsize >> 2) + (oldsize >> 3); + void* block = _memory_allocate((size > lower_bound) ? size : ((size > oldsize) ? lower_bound : size)); + if (p) { + if (!(flags & RPMALLOC_NO_PRESERVE)) + memcpy(block, p, oldsize < size ? oldsize : size); + _memory_deallocate(p); + } + + return block; +} + +//! Get the usable size of the given block +static size_t +_memory_usable_size(void* p) { + //Grab the span using guaranteed span alignment + span_t* span = (span_t*)(void*)((uintptr_t)p & _memory_span_mask); + int32_t heap_id = atomic_load32(&span->heap_id); + if (heap_id) { + //Small/medium block + if (span->size_class < SIZE_CLASS_COUNT) + return _memory_size_class[span->size_class].size; + + //Large block + size_t current_spans = (span->size_class - SIZE_CLASS_COUNT) + 1; + return (current_spans * _memory_span_size) - SPAN_HEADER_SIZE; + } + + //Oversized block, page count is stored in next_span + size_t current_pages = (size_t)span->next_span; + return (current_pages * _memory_page_size) - SPAN_HEADER_SIZE; +} + +//! 
Adjust and optimize the size class properties for the given class +static void +_memory_adjust_size_class(size_t iclass) { + size_t block_size = _memory_size_class[iclass].size; + size_t block_count = (_memory_span_size - SPAN_HEADER_SIZE) / block_size; + + _memory_size_class[iclass].block_count = (uint16_t)block_count; + _memory_size_class[iclass].class_idx = (uint16_t)iclass; + + //Check if previous size classes can be merged + size_t prevclass = iclass; + while (prevclass > 0) { + --prevclass; + //A class can be merged if number of pages and number of blocks are equal + if (_memory_size_class[prevclass].block_count == _memory_size_class[iclass].block_count) { + memcpy(_memory_size_class + prevclass, _memory_size_class + iclass, sizeof(_memory_size_class[iclass])); + } + else { + break; + } + } +} + +} + +#if defined( _WIN32 ) || defined( __WIN32__ ) || defined( _WIN64 ) +# include <windows.h> +#else +# include <sys/mman.h> +# include <sched.h> +# ifndef MAP_UNINITIALIZED +# define MAP_UNINITIALIZED 0 +# endif +#endif +#include <errno.h> + +namespace tracy +{ + +//! Initialize the allocator and setup global data +int +rpmalloc_initialize(void) { + memset(&_memory_config, 0, sizeof(rpmalloc_config_t)); + return rpmalloc_initialize_config(0); +} + +int +rpmalloc_initialize_config(const rpmalloc_config_t* config) { + if (config) + memcpy(&_memory_config, config, sizeof(rpmalloc_config_t)); + + if (!_memory_config.memory_map || !_memory_config.memory_unmap) { + _memory_config.memory_map = _memory_map_os; + _memory_config.memory_unmap = _memory_unmap_os; + } + + _memory_page_size = _memory_config.page_size; + if (!_memory_page_size) { +#if PLATFORM_WINDOWS + SYSTEM_INFO system_info; + memset(&system_info, 0, sizeof(system_info)); + GetSystemInfo(&system_info); + _memory_page_size = system_info.dwPageSize; + _memory_map_granularity = system_info.dwAllocationGranularity; +#else + _memory_page_size = (size_t)sysconf(_SC_PAGESIZE); + _memory_map_granularity = _memory_page_size; +#endif + } + + if (_memory_page_size < 512) + _memory_page_size = 512; + if (_memory_page_size > (16 * 1024)) + _memory_page_size = (16 * 1024); + + _memory_page_size_shift = 0; + size_t page_size_bit = _memory_page_size; + while (page_size_bit != 1) { + ++_memory_page_size_shift; + page_size_bit >>= 1; + } + _memory_page_size = ((size_t)1 << _memory_page_size_shift); + _memory_page_mask = ~(uintptr_t)(_memory_page_size - 1); + + size_t span_size = _memory_config.span_size; + if (!span_size) + span_size = (64 * 1024); + if (span_size > (256 * 1024)) + span_size = (256 * 1024); + _memory_span_size = 4096; + _memory_span_size_shift = 12; + while ((_memory_span_size < span_size) || (_memory_span_size < _memory_page_size)) { + _memory_span_size <<= 1; + ++_memory_span_size_shift; + } + _memory_span_mask = ~(uintptr_t)(_memory_span_size - 1); + + _memory_config.page_size = _memory_page_size; + _memory_config.span_size = _memory_span_size; + + if (!_memory_config.span_map_count) + _memory_config.span_map_count = DEFAULT_SPAN_MAP_COUNT; + if (_memory_config.span_size * _memory_config.span_map_count < _memory_config.page_size) + _memory_config.span_map_count = (_memory_config.page_size / _memory_config.span_size); + if (_memory_config.span_map_count > 128) + _memory_config.span_map_count = 128; + +#if defined(__APPLE__) && ENABLE_PRELOAD + if (pthread_key_create(&_memory_thread_heap, 0)) + return -1; +#endif + + atomic_store32(&_memory_heap_id, 0); + atomic_store32(&_memory_orphan_counter, 0); + 
atomic_store32(&_memory_active_heaps, 0); + + //Setup all small and medium size classes + size_t iclass; + for (iclass = 0; iclass < SMALL_CLASS_COUNT; ++iclass) { + size_t size = (iclass + 1) * SMALL_GRANULARITY; + _memory_size_class[iclass].size = (uint16_t)size; + _memory_adjust_size_class(iclass); + } + + _memory_medium_size_limit = _memory_span_size - SPAN_HEADER_SIZE; + if (_memory_medium_size_limit > MEDIUM_SIZE_LIMIT) + _memory_medium_size_limit = MEDIUM_SIZE_LIMIT; + for (iclass = 0; iclass < MEDIUM_CLASS_COUNT; ++iclass) { + size_t size = SMALL_SIZE_LIMIT + ((iclass + 1) * MEDIUM_GRANULARITY); + if (size > _memory_medium_size_limit) + size = _memory_medium_size_limit; + _memory_size_class[SMALL_CLASS_COUNT + iclass].size = (uint16_t)size; + _memory_adjust_size_class(SMALL_CLASS_COUNT + iclass); + } + + //Initialize this thread + rpmalloc_thread_initialize(); + return 0; +} + +//! Finalize the allocator +void +rpmalloc_finalize(void) { + atomic_thread_fence_acquire(); + + rpmalloc_thread_finalize(); + //If you hit this assert, you still have active threads or forgot to finalize some thread(s) + assert(atomic_load32(&_memory_active_heaps) == 0); + + //Free all thread caches + for (size_t list_idx = 0; list_idx < HEAP_ARRAY_SIZE; ++list_idx) { + heap_t* heap = (heap_t*)atomic_load_ptr(&_memory_heaps[list_idx]); + while (heap) { + _memory_deallocate_deferred(heap, 0); + + //Free span caches (other thread might have deferred after the thread using this heap finalized) +#if ENABLE_THREAD_CACHE + for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) { + if (heap->span_cache[iclass]) + _memory_unmap_span_list(0, heap->span_cache[iclass]); + } +#endif + heap = heap->next_heap; + } + } + +#if ENABLE_GLOBAL_CACHE + //Free global caches + for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) + _memory_cache_finalize(&_memory_span_cache[iclass]); +#endif + + for (size_t list_idx = 0; list_idx < HEAP_ARRAY_SIZE; ++list_idx) { + heap_t* heap = (heap_t*)atomic_load_ptr(&_memory_heaps[list_idx]); + atomic_store_ptr(&_memory_heaps[list_idx], 0); + while (heap) { + if (heap->spans_reserved) { + span_t* span = heap->span_reserve; + span_t* master = heap->span_reserve_master; + uint32_t remains = SPAN_REMAINS(master->flags); + + assert(master != span); + assert(remains >= heap->spans_reserved); + _memory_unmap(span, heap->spans_reserved * _memory_span_size, 0, 0); + _memory_statistics_sub(&_reserved_spans, heap->spans_reserved); + remains = ((uint32_t)heap->spans_reserved >= remains) ? 0 : (remains - (uint32_t)heap->spans_reserved); + if (!remains) { + uint32_t master_span_count = SPAN_COUNT(master->flags); + _memory_statistics_sub(&_reserved_spans, master_span_count); + _memory_unmap(master, master_span_count * _memory_span_size, master->data.list.align_offset, 1); + } + else { + SPAN_SET_REMAINS(master->flags, remains); + } + } + + _memory_unmap_deferred(heap, 0); + + heap_t* next_heap = heap->next_heap; + _memory_unmap(heap, (1 + (sizeof(heap_t) >> _memory_page_size_shift)) * _memory_page_size, heap->align_offset, 1); + heap = next_heap; + } + } + atomic_store_ptr(&_memory_orphan_heaps, 0); + atomic_thread_fence_release(); + +#if ENABLE_STATISTICS + //If you hit these asserts you probably have memory leaks or double frees in your code + assert(!atomic_load32(&_mapped_pages)); + assert(!atomic_load32(&_reserved_spans)); +#endif + +#if defined(__APPLE__) && ENABLE_PRELOAD + pthread_key_delete(_memory_thread_heap); +#endif +} + +//! 
Initialize thread, assign heap +void +rpmalloc_thread_initialize(void) { + if (!get_thread_heap()) { + atomic_incr32(&_memory_active_heaps); + heap_t* heap = _memory_allocate_heap(); +#if ENABLE_STATISTICS + heap->thread_to_global = 0; + heap->global_to_thread = 0; +#endif + set_thread_heap(heap); + } +} + +//! Finalize thread, orphan heap +void +rpmalloc_thread_finalize(void) { + heap_t* heap = get_thread_heap(); + if (!heap) + return; + + _memory_deallocate_deferred(heap, 0); + _memory_unmap_deferred(heap, 0); + + //Release thread cache spans back to global cache +#if ENABLE_THREAD_CACHE + for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) { + span_t* span = heap->span_cache[iclass]; +#if ENABLE_GLOBAL_CACHE + const size_t span_count = iclass + 1; + while (span) { + assert((size_t)SPAN_COUNT(span->flags) == span_count); + span_t* next = _memory_span_list_split(span, !iclass ? MIN_SPAN_CACHE_RELEASE : (MIN_LARGE_SPAN_CACHE_RELEASE / span_count)); + _memory_global_cache_insert(0, span); + span = next; + } +#else + if (span) + _memory_unmap_span_list(heap, span); +#endif + heap->span_cache[iclass] = 0; + } +#endif + + //Orphan the heap + void* raw_heap; + uintptr_t orphan_counter; + heap_t* last_heap; + do { + last_heap = (heap_t*)atomic_load_ptr(&_memory_orphan_heaps); + heap->next_orphan = (heap_t*)(void*)((uintptr_t)last_heap & _memory_page_mask); + orphan_counter = (uintptr_t)atomic_incr32(&_memory_orphan_counter); + raw_heap = (void*)((uintptr_t)heap | (orphan_counter & ~_memory_page_mask)); + } + while (!atomic_cas_ptr(&_memory_orphan_heaps, raw_heap, last_heap)); + + set_thread_heap(0); + atomic_add32(&_memory_active_heaps, -1); +} + +int +rpmalloc_is_thread_initialized(void) { + return (get_thread_heap() != 0) ? 1 : 0; +} + +const rpmalloc_config_t* +rpmalloc_config(void) { + return &_memory_config; +} + +//! Map new pages to virtual memory +static void* +_memory_map_os(size_t size, size_t* offset) { + //Either size is a heap (a single page) or a (multiple) span - we only need to align spans + size_t padding = ((size >= _memory_span_size) && (_memory_span_size > _memory_map_granularity)) ? _memory_span_size : 0; + +#if PLATFORM_WINDOWS + //Ok to MEM_COMMIT - according to MSDN, "actual physical pages are not allocated unless/until the virtual addresses are actually accessed" + void* ptr = VirtualAlloc(0, size + padding, MEM_RESERVE | MEM_COMMIT, PAGE_READWRITE); + if (!ptr) { + assert("Failed to map virtual memory block" && 0); + return 0; + } +#else + void* ptr = mmap(0, size + padding, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS | MAP_UNINITIALIZED, -1, 0); + if ((ptr == MAP_FAILED) || !ptr) { + assert("Failed to map virtual memory block" && 0); + return 0; + } +#endif + + if (padding) { + size_t final_padding = padding - ((uintptr_t)ptr & ~_memory_span_mask); +#if PLATFORM_POSIX + //Unmap the last unused pages, for Windows this is done with the final VirtualFree with MEM_RELEASE call + size_t remains = padding - final_padding; + if (remains) + munmap(pointer_offset(ptr, final_padding + size), remains); +#endif + ptr = pointer_offset(ptr, final_padding); + assert(final_padding <= _memory_span_size); + assert(!(final_padding & 5)); + assert(!((uintptr_t)ptr & ~_memory_span_mask)); + *offset = final_padding >> 3; + assert(*offset < 65536); + } + + return ptr; +} + +//! 
Unmap pages from virtual memory +static void +_memory_unmap_os(void* address, size_t size, size_t offset, int release) { + assert(release || (offset == 0)); + if (release && offset) { + offset <<= 3; +#if PLATFORM_POSIX + size += offset; +#endif + address = pointer_offset(address, -(int32_t)offset); + } +#if PLATFORM_WINDOWS + if (!VirtualFree(address, release ? 0 : size, release ? MEM_RELEASE : MEM_DECOMMIT)) { + DWORD err = GetLastError(); + (void)err; + assert("Failed to unmap virtual memory block" && 0); + } +#else + MEMORY_UNUSED(release); + if (munmap(address, size)) { + assert("Failed to unmap virtual memory block" && 0); + } +#endif +} + +#if ENABLE_GUARDS +static void +_memory_guard_validate(void* p) { + if (!p) + return; + void* block_start; + size_t block_size = _memory_usable_size(p); + span_t* span = (void*)((uintptr_t)p & _memory_span_mask); + int32_t heap_id = atomic_load32(&span->heap_id); + if (heap_id) { + if (span->size_class < SIZE_CLASS_COUNT) { + void* span_blocks_start = pointer_offset(span, SPAN_HEADER_SIZE); + size_class_t* size_class = _memory_size_class + span->size_class; + count_t block_offset = (count_t)pointer_diff(p, span_blocks_start); + count_t block_idx = block_offset / (count_t)size_class->size; + block_start = pointer_offset(span_blocks_start, block_idx * size_class->size); + } + else { + block_start = pointer_offset(span, SPAN_HEADER_SIZE); + } + } + else { + block_start = pointer_offset(span, SPAN_HEADER_SIZE); + } + uint32_t* deadzone = block_start; + //If these asserts fire, you have written to memory before the block start + for (int i = 0; i < 8; ++i) { + if (deadzone[i] != MAGIC_GUARD) { + if (_memory_config.memory_overwrite) + _memory_config.memory_overwrite(p); + else + assert("Memory overwrite before block start" && 0); + return; + } + deadzone[i] = 0; + } + deadzone = (uint32_t*)pointer_offset(block_start, block_size - 32); + //If these asserts fire, you have written to memory after the block end + for (int i = 0; i < 8; ++i) { + if (deadzone[i] != MAGIC_GUARD) { + if (_memory_config.memory_overwrite) + _memory_config.memory_overwrite(p); + else + assert("Memory overwrite after block end" && 0); + return; + } + deadzone[i] = 0; + } +} +#else +#define _memory_guard_validate(block) +#endif + +#if ENABLE_GUARDS +static void +_memory_guard_block(void* block) { + if (block) { + size_t block_size = _memory_usable_size(block); + uint32_t* deadzone = block; + deadzone[0] = deadzone[1] = deadzone[2] = deadzone[3] = + deadzone[4] = deadzone[5] = deadzone[6] = deadzone[7] = MAGIC_GUARD; + deadzone = (uint32_t*)pointer_offset(block, block_size - 32); + deadzone[0] = deadzone[1] = deadzone[2] = deadzone[3] = + deadzone[4] = deadzone[5] = deadzone[6] = deadzone[7] = MAGIC_GUARD; + } +} +#define _memory_guard_pre_alloc(size) size += 64 +#define _memory_guard_pre_realloc(block, size) block = pointer_offset(block, -32); size += 64 +#define _memory_guard_post_alloc(block, size) _memory_guard_block(block); block = pointer_offset(block, 32); size -= 64 +#else +#define _memory_guard_pre_alloc(size) +#define _memory_guard_pre_realloc(block, size) +#define _memory_guard_post_alloc(block, size) +#endif + +// Extern interface + +TRACY_API RPMALLOC_RESTRICT void* +rpmalloc(size_t size) { +#if ENABLE_VALIDATE_ARGS + if (size >= MAX_ALLOC_SIZE) { + errno = EINVAL; + return 0; + } +#endif + _memory_guard_pre_alloc(size); + void* block = _memory_allocate(size); + _memory_guard_post_alloc(block, size); + return block; +} + +TRACY_API void +rpfree(void* ptr) { + 
_memory_guard_validate(ptr); + _memory_deallocate(ptr); +} + +RPMALLOC_RESTRICT void* +rpcalloc(size_t num, size_t size) { + size_t total; +#if ENABLE_VALIDATE_ARGS +#if PLATFORM_WINDOWS + int err = SizeTMult(num, size, &total); + if ((err != S_OK) || (total >= MAX_ALLOC_SIZE)) { + errno = EINVAL; + return 0; + } +#else + int err = __builtin_umull_overflow(num, size, &total); + if (err || (total >= MAX_ALLOC_SIZE)) { + errno = EINVAL; + return 0; + } +#endif +#else + total = num * size; +#endif + _memory_guard_pre_alloc(total); + void* block = _memory_allocate(total); + _memory_guard_post_alloc(block, total); + memset(block, 0, total); + return block; +} + +void* +rprealloc(void* ptr, size_t size) { +#if ENABLE_VALIDATE_ARGS + if (size >= MAX_ALLOC_SIZE) { + errno = EINVAL; + return ptr; + } +#endif + _memory_guard_validate(ptr); + _memory_guard_pre_realloc(ptr, size); + void* block = _memory_reallocate(ptr, size, 0, 0); + _memory_guard_post_alloc(block, size); + return block; +} + +void* +rpaligned_realloc(void* ptr, size_t alignment, size_t size, size_t oldsize, + unsigned int flags) { +#if ENABLE_VALIDATE_ARGS + if ((size + alignment < size) || (alignment > _memory_page_size)) { + errno = EINVAL; + return 0; + } +#endif + void* block; + if (alignment > 32) { + block = rpaligned_alloc(alignment, size); + if (!(flags & RPMALLOC_NO_PRESERVE)) + memcpy(block, ptr, oldsize < size ? oldsize : size); + rpfree(ptr); + } + else { + _memory_guard_validate(ptr); + _memory_guard_pre_realloc(ptr, size); + block = _memory_reallocate(ptr, size, oldsize, flags); + _memory_guard_post_alloc(block, size); + } + return block; +} + +RPMALLOC_RESTRICT void* +rpaligned_alloc(size_t alignment, size_t size) { + if (alignment <= 32) + return rpmalloc(size); + +#if ENABLE_VALIDATE_ARGS + if ((size + alignment < size) || (alignment > _memory_page_size)) { + errno = EINVAL; + return 0; + } +#endif + + void* ptr = rpmalloc(size + alignment); + if ((uintptr_t)ptr & (alignment - 1)) + ptr = (void*)(((uintptr_t)ptr & ~((uintptr_t)alignment - 1)) + alignment); + return ptr; +} + +RPMALLOC_RESTRICT void* +rpmemalign(size_t alignment, size_t size) { + return rpaligned_alloc(alignment, size); +} + +int +rpposix_memalign(void **memptr, size_t alignment, size_t size) { + if (memptr) + *memptr = rpaligned_alloc(alignment, size); + else + return EINVAL; + return *memptr ? 
0 : ENOMEM; +} + +size_t +rpmalloc_usable_size(void* ptr) { + size_t size = 0; + if (ptr) { + size = _memory_usable_size(ptr); +#if ENABLE_GUARDS + size -= 64; +#endif + } + return size; +} + +void +rpmalloc_thread_collect(void) { + heap_t* heap = get_thread_heap(); + _memory_unmap_deferred(heap, 0); + _memory_deallocate_deferred(0, 0); +} + +void +rpmalloc_thread_statistics(rpmalloc_thread_statistics_t* stats) { + memset(stats, 0, sizeof(rpmalloc_thread_statistics_t)); + heap_t* heap = get_thread_heap(); + void* p = atomic_load_ptr(&heap->defer_deallocate); + while (p) { + void* next = *(void**)p; + span_t* span = (span_t*)(void*)((uintptr_t)p & _memory_span_mask); + stats->deferred += _memory_size_class[span->size_class].size; + p = next; + } + + for (size_t isize = 0; isize < SIZE_CLASS_COUNT; ++isize) { + if (heap->active_block[isize].free_count) + stats->active += heap->active_block[isize].free_count * _memory_size_class[heap->active_span[isize]->size_class].size; + + span_t* cache = heap->size_cache[isize]; + while (cache) { + stats->sizecache = cache->data.block.free_count * _memory_size_class[cache->size_class].size; + cache = cache->next_span; + } + } + +#if ENABLE_THREAD_CACHE + for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) { + if (heap->span_cache[iclass]) + stats->spancache = (size_t)heap->span_cache[iclass]->data.list.size * (iclass + 1) * _memory_span_size; + } +#endif +} + +void +rpmalloc_global_statistics(rpmalloc_global_statistics_t* stats) { + memset(stats, 0, sizeof(rpmalloc_global_statistics_t)); +#if ENABLE_STATISTICS + stats->mapped = (size_t)atomic_load32(&_mapped_pages) * _memory_page_size; + stats->mapped_total = (size_t)atomic_load32(&_mapped_total) * _memory_page_size; + stats->unmapped_total = (size_t)atomic_load32(&_unmapped_total) * _memory_page_size; +#endif +#if ENABLE_GLOBAL_CACHE + for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) { + stats->cached += (size_t)atomic_load32(&_memory_span_cache[iclass].size) * (iclass + 1) * _memory_span_size; + } +#endif +} + +} + +#ifdef _MSC_VER +# pragma warning( pop ) +#endif + +#endif diff --git a/libs/tracy/client/tracy_rpmalloc.hpp b/libs/tracy/client/tracy_rpmalloc.hpp @@ -0,0 +1,153 @@ +/* rpmalloc.h - Memory allocator - Public Domain - 2016 Mattias Jansson / Rampant Pixels + * + * This library provides a cross-platform lock free thread caching malloc implementation in C11. + * The latest source code is always available at + * + * https://github.com/rampantpixels/rpmalloc + * + * This library is put in the public domain; you can redistribute it and/or modify it without any restrictions. + * + */ + +#pragma once + +#include <stddef.h> + +#include "../common/TracyApi.h" + +namespace tracy +{ + +#if defined(__clang__) || defined(__GNUC__) +# define RPMALLOC_ATTRIBUTE __attribute__((__malloc__)) +# define RPMALLOC_RESTRICT +# define RPMALLOC_CDECL +#elif defined(_MSC_VER) +# define RPMALLOC_ATTRIBUTE +# define RPMALLOC_RESTRICT __declspec(restrict) +# define RPMALLOC_CDECL __cdecl +#else +# define RPMALLOC_ATTRIBUTE +# define RPMALLOC_RESTRICT +# define RPMALLOC_CDECL +#endif + +//! Flag to rpaligned_realloc to not preserve content in reallocation +#define RPMALLOC_NO_PRESERVE 1 + +typedef struct rpmalloc_global_statistics_t { + //! Current amount of virtual memory mapped (only if ENABLE_STATISTICS=1) + size_t mapped; + //! Current amount of memory in global caches for small and medium sizes (<64KiB) + size_t cached; + //! 
Total amount of memory mapped (only if ENABLE_STATISTICS=1) + size_t mapped_total; + //! Total amount of memory unmapped (only if ENABLE_STATISTICS=1) + size_t unmapped_total; +} rpmalloc_global_statistics_t; + +typedef struct rpmalloc_thread_statistics_t { + //! Current number of bytes available for allocation from active spans + size_t active; + //! Current number of bytes available in thread size class caches + size_t sizecache; + //! Current number of bytes available in thread span caches + size_t spancache; + //! Current number of bytes in pending deferred deallocations + size_t deferred; + //! Total number of bytes transitioned from thread cache to global cache + size_t thread_to_global; + //! Total number of bytes transitioned from global cache to thread cache + size_t global_to_thread; +} rpmalloc_thread_statistics_t; + +typedef struct rpmalloc_config_t { + //! Map memory pages for the given number of bytes. The returned address MUST be + // aligned to the rpmalloc span size, which will always be a power of two. + // Optionally the function can store an alignment offset in the offset variable + // in case it performs alignment and the returned pointer is offset from the + // actual start of the memory region due to this alignment. The alignment offset + // will be passed to the memory unmap function. The alignment offset MUST NOT be + // larger than 65535 (storable in an uint16_t), if it is you must use natural + // alignment to shift it into 16 bits. + void* (*memory_map)(size_t size, size_t* offset); + //! Unmap the memory pages starting at address and spanning the given number of bytes. + // If release is set to 1, the unmap is for an entire span range as returned by + // a previous call to memory_map and that the entire range should be released. + // If release is set to 0, the unmap is a partial decommit of a subset of the mapped + // memory range. + void (*memory_unmap)(void* address, size_t size, size_t offset, int release); + //! Size of memory pages. The page size MUST be a power of two in [512,16384] range + // (2^9 to 2^14) unless 0 - set to 0 to use system page size. All memory mapping + // requests to memory_map will be made with size set to a multiple of the page size. + size_t page_size; + //! Size of a span of memory pages. MUST be a multiple of page size, and in [4096,262144] + // range (unless 0 - set to 0 to use the default span size). + size_t span_size; + //! Number of spans to map at each request to map new virtual memory blocks. This can + // be used to minimize the system call overhead at the cost of virtual memory address + // space. The extra mapped pages will not be written until actually used, so physical + // committed memory should not be affected in the default implementation. + size_t span_map_count; + //! Debug callback if memory guards are enabled. 
Called if a memory overwrite is detected + void (*memory_overwrite)(void* address); +} rpmalloc_config_t; + +extern int +rpmalloc_initialize(void); + +extern int +rpmalloc_initialize_config(const rpmalloc_config_t* config); + +extern const rpmalloc_config_t* +rpmalloc_config(void); + +extern void +rpmalloc_finalize(void); + +void +rpmalloc_thread_initialize(void); + +extern void +rpmalloc_thread_finalize(void); + +extern void +rpmalloc_thread_collect(void); + +extern int +rpmalloc_is_thread_initialized(void); + +extern void +rpmalloc_thread_statistics(rpmalloc_thread_statistics_t* stats); + +extern void +rpmalloc_global_statistics(rpmalloc_global_statistics_t* stats); + +TRACY_API RPMALLOC_RESTRICT void* +rpmalloc(size_t size) RPMALLOC_ATTRIBUTE; + +TRACY_API void +rpfree(void* ptr); + +extern RPMALLOC_RESTRICT void* +rpcalloc(size_t num, size_t size) RPMALLOC_ATTRIBUTE; + +extern void* +rprealloc(void* ptr, size_t size); + +extern void* +rpaligned_realloc(void* ptr, size_t alignment, size_t size, size_t oldsize, unsigned int flags); + +extern RPMALLOC_RESTRICT void* +rpaligned_alloc(size_t alignment, size_t size) RPMALLOC_ATTRIBUTE; + +extern RPMALLOC_RESTRICT void* +rpmemalign(size_t alignment, size_t size) RPMALLOC_ATTRIBUTE; + +extern int +rpposix_memalign(void **memptr, size_t alignment, size_t size); + +extern size_t +rpmalloc_usable_size(void* ptr); + +} diff --git a/libs/tracy/common/TracyAlign.hpp b/libs/tracy/common/TracyAlign.hpp @@ -0,0 +1,27 @@ +#ifndef __TRACYALIGN_HPP__ +#define __TRACYALIGN_HPP__ + +#include <string.h> + +#include "TracyForceInline.hpp" + +namespace tracy +{ + +template<typename T> +tracy_force_inline T MemRead( const void* ptr ) +{ + T val; + memcpy( &val, ptr, sizeof( T ) ); + return val; +} + +template<typename T> +tracy_force_inline void MemWrite( void* ptr, T val ) +{ + memcpy( ptr, &val, sizeof( T ) ); +} + +} + +#endif diff --git a/libs/tracy/common/TracyAlloc.hpp b/libs/tracy/common/TracyAlloc.hpp @@ -0,0 +1,33 @@ +#ifndef __TRACYALLOC_HPP__ +#define __TRACYALLOC_HPP__ + +#include <stdlib.h> + +#ifdef TRACY_ENABLE +# include "../client/tracy_rpmalloc.hpp" +#endif + +namespace tracy +{ + +static inline void* tracy_malloc( size_t size ) +{ +#ifdef TRACY_ENABLE + return rpmalloc( size ); +#else + return malloc( size ); +#endif +} + +static inline void tracy_free( void* ptr ) +{ +#ifdef TRACY_ENABLE + rpfree( ptr ); +#else + free( ptr ); +#endif +} + +} + +#endif diff --git a/libs/tracy/common/TracyApi.h b/libs/tracy/common/TracyApi.h @@ -0,0 +1,14 @@ +#ifndef __TRACYAPI_H__ +#define __TRACYAPI_H__ + +#ifdef _WIN32 +# if defined TRACY_IMPORTS +# define TRACY_API __declspec(dllimport) +# else +# define TRACY_API __declspec(dllexport) +# endif +#else +# define TRACY_API __attribute__((visibility("default"))) +#endif + +#endif // __TRACYAPI_H__ diff --git a/libs/tracy/common/TracyColor.hpp b/libs/tracy/common/TracyColor.hpp @@ -0,0 +1,690 @@ +#ifndef __TRACYCOLOR_HPP__ +#define __TRACYCOLOR_HPP__ + +namespace tracy +{ +struct Color +{ +enum ColorType +{ + Snow = 0xfffafa, + GhostWhite = 0xf8f8ff, + WhiteSmoke = 0xf5f5f5, + Gainsboro = 0xdcdcdc, + FloralWhite = 0xfffaf0, + OldLace = 0xfdf5e6, + Linen = 0xfaf0e6, + AntiqueWhite = 0xfaebd7, + PapayaWhip = 0xffefd5, + BlanchedAlmond = 0xffebcd, + Bisque = 0xffe4c4, + PeachPuff = 0xffdab9, + NavajoWhite = 0xffdead, + Moccasin = 0xffe4b5, + Cornsilk = 0xfff8dc, + Ivory = 0xfffff0, + LemonChiffon = 0xfffacd, + Seashell = 0xfff5ee, + Honeydew = 0xf0fff0, + MintCream = 0xf5fffa, + Azure = 0xf0ffff, + AliceBlue = 
0xf0f8ff, + Lavender = 0xe6e6fa, + LavenderBlush = 0xfff0f5, + MistyRose = 0xffe4e1, + White = 0xffffff, + Black = 0x000000, + DarkSlateGray = 0x2f4f4f, + DarkSlateGrey = 0x2f4f4f, + DimGray = 0x696969, + DimGrey = 0x696969, + SlateGray = 0x708090, + SlateGrey = 0x708090, + LightSlateGray = 0x778899, + LightSlateGrey = 0x778899, + Gray = 0xbebebe, + Grey = 0xbebebe, + X11Gray = 0xbebebe, + X11Grey = 0xbebebe, + WebGray = 0x808080, + WebGrey = 0x808080, + LightGrey = 0xd3d3d3, + LightGray = 0xd3d3d3, + MidnightBlue = 0x191970, + Navy = 0x000080, + NavyBlue = 0x000080, + CornflowerBlue = 0x6495ed, + DarkSlateBlue = 0x483d8b, + SlateBlue = 0x6a5acd, + MediumSlateBlue = 0x7b68ee, + LightSlateBlue = 0x8470ff, + MediumBlue = 0x0000cd, + RoyalBlue = 0x4169e1, + Blue = 0x0000ff, + DodgerBlue = 0x1e90ff, + DeepSkyBlue = 0x00bfff, + SkyBlue = 0x87ceeb, + LightSkyBlue = 0x87cefa, + SteelBlue = 0x4682b4, + LightSteelBlue = 0xb0c4de, + LightBlue = 0xadd8e6, + PowderBlue = 0xb0e0e6, + PaleTurquoise = 0xafeeee, + DarkTurquoise = 0x00ced1, + MediumTurquoise = 0x48d1cc, + Turquoise = 0x40e0d0, + Cyan = 0x00ffff, + Aqua = 0x00ffff, + LightCyan = 0xe0ffff, + CadetBlue = 0x5f9ea0, + MediumAquamarine = 0x66cdaa, + Aquamarine = 0x7fffd4, + DarkGreen = 0x006400, + DarkOliveGreen = 0x556b2f, + DarkSeaGreen = 0x8fbc8f, + SeaGreen = 0x2e8b57, + MediumSeaGreen = 0x3cb371, + LightSeaGreen = 0x20b2aa, + PaleGreen = 0x98fb98, + SpringGreen = 0x00ff7f, + LawnGreen = 0x7cfc00, + Green = 0x00ff00, + Lime = 0x00ff00, + X11Green = 0x00ff00, + WebGreen = 0x008000, + Chartreuse = 0x7fff00, + MediumSpringGreen = 0x00fa9a, + GreenYellow = 0xadff2f, + LimeGreen = 0x32cd32, + YellowGreen = 0x9acd32, + ForestGreen = 0x228b22, + OliveDrab = 0x6b8e23, + DarkKhaki = 0xbdb76b, + Khaki = 0xf0e68c, + PaleGoldenrod = 0xeee8aa, + LightGoldenrodYellow = 0xfafad2, + LightYellow = 0xffffe0, + Yellow = 0xffff00, + Gold = 0xffd700, + LightGoldenrod = 0xeedd82, + Goldenrod = 0xdaa520, + DarkGoldenrod = 0xb8860b, + RosyBrown = 0xbc8f8f, + IndianRed = 0xcd5c5c, + SaddleBrown = 0x8b4513, + Sienna = 0xa0522d, + Peru = 0xcd853f, + Burlywood = 0xdeb887, + Beige = 0xf5f5dc, + Wheat = 0xf5deb3, + SandyBrown = 0xf4a460, + Tan = 0xd2b48c, + Chocolate = 0xd2691e, + Firebrick = 0xb22222, + Brown = 0xa52a2a, + DarkSalmon = 0xe9967a, + Salmon = 0xfa8072, + LightSalmon = 0xffa07a, + Orange = 0xffa500, + DarkOrange = 0xff8c00, + Coral = 0xff7f50, + LightCoral = 0xf08080, + Tomato = 0xff6347, + OrangeRed = 0xff4500, + Red = 0xff0000, + HotPink = 0xff69b4, + DeepPink = 0xff1493, + Pink = 0xffc0cb, + LightPink = 0xffb6c1, + PaleVioletRed = 0xdb7093, + Maroon = 0xb03060, + X11Maroon = 0xb03060, + WebMaroon = 0x800000, + MediumVioletRed = 0xc71585, + VioletRed = 0xd02090, + Magenta = 0xff00ff, + Fuchsia = 0xff00ff, + Violet = 0xee82ee, + Plum = 0xdda0dd, + Orchid = 0xda70d6, + MediumOrchid = 0xba55d3, + DarkOrchid = 0x9932cc, + DarkViolet = 0x9400d3, + BlueViolet = 0x8a2be2, + Purple = 0xa020f0, + X11Purple = 0xa020f0, + WebPurple = 0x800080, + MediumPurple = 0x9370db, + Thistle = 0xd8bfd8, + Snow1 = 0xfffafa, + Snow2 = 0xeee9e9, + Snow3 = 0xcdc9c9, + Snow4 = 0x8b8989, + Seashell1 = 0xfff5ee, + Seashell2 = 0xeee5de, + Seashell3 = 0xcdc5bf, + Seashell4 = 0x8b8682, + AntiqueWhite1 = 0xffefdb, + AntiqueWhite2 = 0xeedfcc, + AntiqueWhite3 = 0xcdc0b0, + AntiqueWhite4 = 0x8b8378, + Bisque1 = 0xffe4c4, + Bisque2 = 0xeed5b7, + Bisque3 = 0xcdb79e, + Bisque4 = 0x8b7d6b, + PeachPuff1 = 0xffdab9, + PeachPuff2 = 0xeecbad, + PeachPuff3 = 0xcdaf95, + PeachPuff4 = 0x8b7765, + 
NavajoWhite1 = 0xffdead, + NavajoWhite2 = 0xeecfa1, + NavajoWhite3 = 0xcdb38b, + NavajoWhite4 = 0x8b795e, + LemonChiffon1 = 0xfffacd, + LemonChiffon2 = 0xeee9bf, + LemonChiffon3 = 0xcdc9a5, + LemonChiffon4 = 0x8b8970, + Cornsilk1 = 0xfff8dc, + Cornsilk2 = 0xeee8cd, + Cornsilk3 = 0xcdc8b1, + Cornsilk4 = 0x8b8878, + Ivory1 = 0xfffff0, + Ivory2 = 0xeeeee0, + Ivory3 = 0xcdcdc1, + Ivory4 = 0x8b8b83, + Honeydew1 = 0xf0fff0, + Honeydew2 = 0xe0eee0, + Honeydew3 = 0xc1cdc1, + Honeydew4 = 0x838b83, + LavenderBlush1 = 0xfff0f5, + LavenderBlush2 = 0xeee0e5, + LavenderBlush3 = 0xcdc1c5, + LavenderBlush4 = 0x8b8386, + MistyRose1 = 0xffe4e1, + MistyRose2 = 0xeed5d2, + MistyRose3 = 0xcdb7b5, + MistyRose4 = 0x8b7d7b, + Azure1 = 0xf0ffff, + Azure2 = 0xe0eeee, + Azure3 = 0xc1cdcd, + Azure4 = 0x838b8b, + SlateBlue1 = 0x836fff, + SlateBlue2 = 0x7a67ee, + SlateBlue3 = 0x6959cd, + SlateBlue4 = 0x473c8b, + RoyalBlue1 = 0x4876ff, + RoyalBlue2 = 0x436eee, + RoyalBlue3 = 0x3a5fcd, + RoyalBlue4 = 0x27408b, + Blue1 = 0x0000ff, + Blue2 = 0x0000ee, + Blue3 = 0x0000cd, + Blue4 = 0x00008b, + DodgerBlue1 = 0x1e90ff, + DodgerBlue2 = 0x1c86ee, + DodgerBlue3 = 0x1874cd, + DodgerBlue4 = 0x104e8b, + SteelBlue1 = 0x63b8ff, + SteelBlue2 = 0x5cacee, + SteelBlue3 = 0x4f94cd, + SteelBlue4 = 0x36648b, + DeepSkyBlue1 = 0x00bfff, + DeepSkyBlue2 = 0x00b2ee, + DeepSkyBlue3 = 0x009acd, + DeepSkyBlue4 = 0x00688b, + SkyBlue1 = 0x87ceff, + SkyBlue2 = 0x7ec0ee, + SkyBlue3 = 0x6ca6cd, + SkyBlue4 = 0x4a708b, + LightSkyBlue1 = 0xb0e2ff, + LightSkyBlue2 = 0xa4d3ee, + LightSkyBlue3 = 0x8db6cd, + LightSkyBlue4 = 0x607b8b, + SlateGray1 = 0xc6e2ff, + SlateGray2 = 0xb9d3ee, + SlateGray3 = 0x9fb6cd, + SlateGray4 = 0x6c7b8b, + LightSteelBlue1 = 0xcae1ff, + LightSteelBlue2 = 0xbcd2ee, + LightSteelBlue3 = 0xa2b5cd, + LightSteelBlue4 = 0x6e7b8b, + LightBlue1 = 0xbfefff, + LightBlue2 = 0xb2dfee, + LightBlue3 = 0x9ac0cd, + LightBlue4 = 0x68838b, + LightCyan1 = 0xe0ffff, + LightCyan2 = 0xd1eeee, + LightCyan3 = 0xb4cdcd, + LightCyan4 = 0x7a8b8b, + PaleTurquoise1 = 0xbbffff, + PaleTurquoise2 = 0xaeeeee, + PaleTurquoise3 = 0x96cdcd, + PaleTurquoise4 = 0x668b8b, + CadetBlue1 = 0x98f5ff, + CadetBlue2 = 0x8ee5ee, + CadetBlue3 = 0x7ac5cd, + CadetBlue4 = 0x53868b, + Turquoise1 = 0x00f5ff, + Turquoise2 = 0x00e5ee, + Turquoise3 = 0x00c5cd, + Turquoise4 = 0x00868b, + Cyan1 = 0x00ffff, + Cyan2 = 0x00eeee, + Cyan3 = 0x00cdcd, + Cyan4 = 0x008b8b, + DarkSlateGray1 = 0x97ffff, + DarkSlateGray2 = 0x8deeee, + DarkSlateGray3 = 0x79cdcd, + DarkSlateGray4 = 0x528b8b, + Aquamarine1 = 0x7fffd4, + Aquamarine2 = 0x76eec6, + Aquamarine3 = 0x66cdaa, + Aquamarine4 = 0x458b74, + DarkSeaGreen1 = 0xc1ffc1, + DarkSeaGreen2 = 0xb4eeb4, + DarkSeaGreen3 = 0x9bcd9b, + DarkSeaGreen4 = 0x698b69, + SeaGreen1 = 0x54ff9f, + SeaGreen2 = 0x4eee94, + SeaGreen3 = 0x43cd80, + SeaGreen4 = 0x2e8b57, + PaleGreen1 = 0x9aff9a, + PaleGreen2 = 0x90ee90, + PaleGreen3 = 0x7ccd7c, + PaleGreen4 = 0x548b54, + SpringGreen1 = 0x00ff7f, + SpringGreen2 = 0x00ee76, + SpringGreen3 = 0x00cd66, + SpringGreen4 = 0x008b45, + Green1 = 0x00ff00, + Green2 = 0x00ee00, + Green3 = 0x00cd00, + Green4 = 0x008b00, + Chartreuse1 = 0x7fff00, + Chartreuse2 = 0x76ee00, + Chartreuse3 = 0x66cd00, + Chartreuse4 = 0x458b00, + OliveDrab1 = 0xc0ff3e, + OliveDrab2 = 0xb3ee3a, + OliveDrab3 = 0x9acd32, + OliveDrab4 = 0x698b22, + DarkOliveGreen1 = 0xcaff70, + DarkOliveGreen2 = 0xbcee68, + DarkOliveGreen3 = 0xa2cd5a, + DarkOliveGreen4 = 0x6e8b3d, + Khaki1 = 0xfff68f, + Khaki2 = 0xeee685, + Khaki3 = 0xcdc673, + Khaki4 = 0x8b864e, + LightGoldenrod1 = 
0xffec8b, + LightGoldenrod2 = 0xeedc82, + LightGoldenrod3 = 0xcdbe70, + LightGoldenrod4 = 0x8b814c, + LightYellow1 = 0xffffe0, + LightYellow2 = 0xeeeed1, + LightYellow3 = 0xcdcdb4, + LightYellow4 = 0x8b8b7a, + Yellow1 = 0xffff00, + Yellow2 = 0xeeee00, + Yellow3 = 0xcdcd00, + Yellow4 = 0x8b8b00, + Gold1 = 0xffd700, + Gold2 = 0xeec900, + Gold3 = 0xcdad00, + Gold4 = 0x8b7500, + Goldenrod1 = 0xffc125, + Goldenrod2 = 0xeeb422, + Goldenrod3 = 0xcd9b1d, + Goldenrod4 = 0x8b6914, + DarkGoldenrod1 = 0xffb90f, + DarkGoldenrod2 = 0xeead0e, + DarkGoldenrod3 = 0xcd950c, + DarkGoldenrod4 = 0x8b6508, + RosyBrown1 = 0xffc1c1, + RosyBrown2 = 0xeeb4b4, + RosyBrown3 = 0xcd9b9b, + RosyBrown4 = 0x8b6969, + IndianRed1 = 0xff6a6a, + IndianRed2 = 0xee6363, + IndianRed3 = 0xcd5555, + IndianRed4 = 0x8b3a3a, + Sienna1 = 0xff8247, + Sienna2 = 0xee7942, + Sienna3 = 0xcd6839, + Sienna4 = 0x8b4726, + Burlywood1 = 0xffd39b, + Burlywood2 = 0xeec591, + Burlywood3 = 0xcdaa7d, + Burlywood4 = 0x8b7355, + Wheat1 = 0xffe7ba, + Wheat2 = 0xeed8ae, + Wheat3 = 0xcdba96, + Wheat4 = 0x8b7e66, + Tan1 = 0xffa54f, + Tan2 = 0xee9a49, + Tan3 = 0xcd853f, + Tan4 = 0x8b5a2b, + Chocolate1 = 0xff7f24, + Chocolate2 = 0xee7621, + Chocolate3 = 0xcd661d, + Chocolate4 = 0x8b4513, + Firebrick1 = 0xff3030, + Firebrick2 = 0xee2c2c, + Firebrick3 = 0xcd2626, + Firebrick4 = 0x8b1a1a, + Brown1 = 0xff4040, + Brown2 = 0xee3b3b, + Brown3 = 0xcd3333, + Brown4 = 0x8b2323, + Salmon1 = 0xff8c69, + Salmon2 = 0xee8262, + Salmon3 = 0xcd7054, + Salmon4 = 0x8b4c39, + LightSalmon1 = 0xffa07a, + LightSalmon2 = 0xee9572, + LightSalmon3 = 0xcd8162, + LightSalmon4 = 0x8b5742, + Orange1 = 0xffa500, + Orange2 = 0xee9a00, + Orange3 = 0xcd8500, + Orange4 = 0x8b5a00, + DarkOrange1 = 0xff7f00, + DarkOrange2 = 0xee7600, + DarkOrange3 = 0xcd6600, + DarkOrange4 = 0x8b4500, + Coral1 = 0xff7256, + Coral2 = 0xee6a50, + Coral3 = 0xcd5b45, + Coral4 = 0x8b3e2f, + Tomato1 = 0xff6347, + Tomato2 = 0xee5c42, + Tomato3 = 0xcd4f39, + Tomato4 = 0x8b3626, + OrangeRed1 = 0xff4500, + OrangeRed2 = 0xee4000, + OrangeRed3 = 0xcd3700, + OrangeRed4 = 0x8b2500, + Red1 = 0xff0000, + Red2 = 0xee0000, + Red3 = 0xcd0000, + Red4 = 0x8b0000, + DeepPink1 = 0xff1493, + DeepPink2 = 0xee1289, + DeepPink3 = 0xcd1076, + DeepPink4 = 0x8b0a50, + HotPink1 = 0xff6eb4, + HotPink2 = 0xee6aa7, + HotPink3 = 0xcd6090, + HotPink4 = 0x8b3a62, + Pink1 = 0xffb5c5, + Pink2 = 0xeea9b8, + Pink3 = 0xcd919e, + Pink4 = 0x8b636c, + LightPink1 = 0xffaeb9, + LightPink2 = 0xeea2ad, + LightPink3 = 0xcd8c95, + LightPink4 = 0x8b5f65, + PaleVioletRed1 = 0xff82ab, + PaleVioletRed2 = 0xee799f, + PaleVioletRed3 = 0xcd6889, + PaleVioletRed4 = 0x8b475d, + Maroon1 = 0xff34b3, + Maroon2 = 0xee30a7, + Maroon3 = 0xcd2990, + Maroon4 = 0x8b1c62, + VioletRed1 = 0xff3e96, + VioletRed2 = 0xee3a8c, + VioletRed3 = 0xcd3278, + VioletRed4 = 0x8b2252, + Magenta1 = 0xff00ff, + Magenta2 = 0xee00ee, + Magenta3 = 0xcd00cd, + Magenta4 = 0x8b008b, + Orchid1 = 0xff83fa, + Orchid2 = 0xee7ae9, + Orchid3 = 0xcd69c9, + Orchid4 = 0x8b4789, + Plum1 = 0xffbbff, + Plum2 = 0xeeaeee, + Plum3 = 0xcd96cd, + Plum4 = 0x8b668b, + MediumOrchid1 = 0xe066ff, + MediumOrchid2 = 0xd15fee, + MediumOrchid3 = 0xb452cd, + MediumOrchid4 = 0x7a378b, + DarkOrchid1 = 0xbf3eff, + DarkOrchid2 = 0xb23aee, + DarkOrchid3 = 0x9a32cd, + DarkOrchid4 = 0x68228b, + Purple1 = 0x9b30ff, + Purple2 = 0x912cee, + Purple3 = 0x7d26cd, + Purple4 = 0x551a8b, + MediumPurple1 = 0xab82ff, + MediumPurple2 = 0x9f79ee, + MediumPurple3 = 0x8968cd, + MediumPurple4 = 0x5d478b, + Thistle1 = 0xffe1ff, + Thistle2 = 0xeed2ee, 
+ Thistle3 = 0xcdb5cd, + Thistle4 = 0x8b7b8b, + Gray0 = 0x000000, + Grey0 = 0x000000, + Gray1 = 0x030303, + Grey1 = 0x030303, + Gray2 = 0x050505, + Grey2 = 0x050505, + Gray3 = 0x080808, + Grey3 = 0x080808, + Gray4 = 0x0a0a0a, + Grey4 = 0x0a0a0a, + Gray5 = 0x0d0d0d, + Grey5 = 0x0d0d0d, + Gray6 = 0x0f0f0f, + Grey6 = 0x0f0f0f, + Gray7 = 0x121212, + Grey7 = 0x121212, + Gray8 = 0x141414, + Grey8 = 0x141414, + Gray9 = 0x171717, + Grey9 = 0x171717, + Gray10 = 0x1a1a1a, + Grey10 = 0x1a1a1a, + Gray11 = 0x1c1c1c, + Grey11 = 0x1c1c1c, + Gray12 = 0x1f1f1f, + Grey12 = 0x1f1f1f, + Gray13 = 0x212121, + Grey13 = 0x212121, + Gray14 = 0x242424, + Grey14 = 0x242424, + Gray15 = 0x262626, + Grey15 = 0x262626, + Gray16 = 0x292929, + Grey16 = 0x292929, + Gray17 = 0x2b2b2b, + Grey17 = 0x2b2b2b, + Gray18 = 0x2e2e2e, + Grey18 = 0x2e2e2e, + Gray19 = 0x303030, + Grey19 = 0x303030, + Gray20 = 0x333333, + Grey20 = 0x333333, + Gray21 = 0x363636, + Grey21 = 0x363636, + Gray22 = 0x383838, + Grey22 = 0x383838, + Gray23 = 0x3b3b3b, + Grey23 = 0x3b3b3b, + Gray24 = 0x3d3d3d, + Grey24 = 0x3d3d3d, + Gray25 = 0x404040, + Grey25 = 0x404040, + Gray26 = 0x424242, + Grey26 = 0x424242, + Gray27 = 0x454545, + Grey27 = 0x454545, + Gray28 = 0x474747, + Grey28 = 0x474747, + Gray29 = 0x4a4a4a, + Grey29 = 0x4a4a4a, + Gray30 = 0x4d4d4d, + Grey30 = 0x4d4d4d, + Gray31 = 0x4f4f4f, + Grey31 = 0x4f4f4f, + Gray32 = 0x525252, + Grey32 = 0x525252, + Gray33 = 0x545454, + Grey33 = 0x545454, + Gray34 = 0x575757, + Grey34 = 0x575757, + Gray35 = 0x595959, + Grey35 = 0x595959, + Gray36 = 0x5c5c5c, + Grey36 = 0x5c5c5c, + Gray37 = 0x5e5e5e, + Grey37 = 0x5e5e5e, + Gray38 = 0x616161, + Grey38 = 0x616161, + Gray39 = 0x636363, + Grey39 = 0x636363, + Gray40 = 0x666666, + Grey40 = 0x666666, + Gray41 = 0x696969, + Grey41 = 0x696969, + Gray42 = 0x6b6b6b, + Grey42 = 0x6b6b6b, + Gray43 = 0x6e6e6e, + Grey43 = 0x6e6e6e, + Gray44 = 0x707070, + Grey44 = 0x707070, + Gray45 = 0x737373, + Grey45 = 0x737373, + Gray46 = 0x757575, + Grey46 = 0x757575, + Gray47 = 0x787878, + Grey47 = 0x787878, + Gray48 = 0x7a7a7a, + Grey48 = 0x7a7a7a, + Gray49 = 0x7d7d7d, + Grey49 = 0x7d7d7d, + Gray50 = 0x7f7f7f, + Grey50 = 0x7f7f7f, + Gray51 = 0x828282, + Grey51 = 0x828282, + Gray52 = 0x858585, + Grey52 = 0x858585, + Gray53 = 0x878787, + Grey53 = 0x878787, + Gray54 = 0x8a8a8a, + Grey54 = 0x8a8a8a, + Gray55 = 0x8c8c8c, + Grey55 = 0x8c8c8c, + Gray56 = 0x8f8f8f, + Grey56 = 0x8f8f8f, + Gray57 = 0x919191, + Grey57 = 0x919191, + Gray58 = 0x949494, + Grey58 = 0x949494, + Gray59 = 0x969696, + Grey59 = 0x969696, + Gray60 = 0x999999, + Grey60 = 0x999999, + Gray61 = 0x9c9c9c, + Grey61 = 0x9c9c9c, + Gray62 = 0x9e9e9e, + Grey62 = 0x9e9e9e, + Gray63 = 0xa1a1a1, + Grey63 = 0xa1a1a1, + Gray64 = 0xa3a3a3, + Grey64 = 0xa3a3a3, + Gray65 = 0xa6a6a6, + Grey65 = 0xa6a6a6, + Gray66 = 0xa8a8a8, + Grey66 = 0xa8a8a8, + Gray67 = 0xababab, + Grey67 = 0xababab, + Gray68 = 0xadadad, + Grey68 = 0xadadad, + Gray69 = 0xb0b0b0, + Grey69 = 0xb0b0b0, + Gray70 = 0xb3b3b3, + Grey70 = 0xb3b3b3, + Gray71 = 0xb5b5b5, + Grey71 = 0xb5b5b5, + Gray72 = 0xb8b8b8, + Grey72 = 0xb8b8b8, + Gray73 = 0xbababa, + Grey73 = 0xbababa, + Gray74 = 0xbdbdbd, + Grey74 = 0xbdbdbd, + Gray75 = 0xbfbfbf, + Grey75 = 0xbfbfbf, + Gray76 = 0xc2c2c2, + Grey76 = 0xc2c2c2, + Gray77 = 0xc4c4c4, + Grey77 = 0xc4c4c4, + Gray78 = 0xc7c7c7, + Grey78 = 0xc7c7c7, + Gray79 = 0xc9c9c9, + Grey79 = 0xc9c9c9, + Gray80 = 0xcccccc, + Grey80 = 0xcccccc, + Gray81 = 0xcfcfcf, + Grey81 = 0xcfcfcf, + Gray82 = 0xd1d1d1, + Grey82 = 0xd1d1d1, + Gray83 = 0xd4d4d4, + Grey83 = 0xd4d4d4, 
+ Gray84 = 0xd6d6d6, + Grey84 = 0xd6d6d6, + Gray85 = 0xd9d9d9, + Grey85 = 0xd9d9d9, + Gray86 = 0xdbdbdb, + Grey86 = 0xdbdbdb, + Gray87 = 0xdedede, + Grey87 = 0xdedede, + Gray88 = 0xe0e0e0, + Grey88 = 0xe0e0e0, + Gray89 = 0xe3e3e3, + Grey89 = 0xe3e3e3, + Gray90 = 0xe5e5e5, + Grey90 = 0xe5e5e5, + Gray91 = 0xe8e8e8, + Grey91 = 0xe8e8e8, + Gray92 = 0xebebeb, + Grey92 = 0xebebeb, + Gray93 = 0xededed, + Grey93 = 0xededed, + Gray94 = 0xf0f0f0, + Grey94 = 0xf0f0f0, + Gray95 = 0xf2f2f2, + Grey95 = 0xf2f2f2, + Gray96 = 0xf5f5f5, + Grey96 = 0xf5f5f5, + Gray97 = 0xf7f7f7, + Grey97 = 0xf7f7f7, + Gray98 = 0xfafafa, + Grey98 = 0xfafafa, + Gray99 = 0xfcfcfc, + Grey99 = 0xfcfcfc, + Gray100 = 0xffffff, + Grey100 = 0xffffff, + DarkGrey = 0xa9a9a9, + DarkGray = 0xa9a9a9, + DarkBlue = 0x00008b, + DarkCyan = 0x008b8b, + DarkMagenta = 0x8b008b, + DarkRed = 0x8b0000, + LightGreen = 0x90ee90, + Crimson = 0xdc143c, + Indigo = 0x4b0082, + Olive = 0x808000, + RebeccaPurple = 0x663399, + Silver = 0xc0c0c0, + Teal = 0x008080, +}; +}; +} + +#endif diff --git a/libs/tracy/common/TracyForceInline.hpp b/libs/tracy/common/TracyForceInline.hpp @@ -0,0 +1,20 @@ +#ifndef __TRACYFORCEINLINE_HPP__ +#define __TRACYFORCEINLINE_HPP__ + +#if defined(__GNUC__) +# define tracy_force_inline __attribute__((always_inline)) inline +#elif defined(_MSC_VER) +# define tracy_force_inline __forceinline +#else +# define tracy_force_inline inline +#endif + +#if defined(__GNUC__) +# define tracy_no_inline __attribute__((noinline)) +#elif defined(_MSC_VER) +# define tracy_no_inline __declspec(noinline) +#else +# define tracy_no_inline +#endif + +#endif diff --git a/libs/tracy/common/TracyMutex.hpp b/libs/tracy/common/TracyMutex.hpp @@ -0,0 +1,33 @@ +#ifndef __TRACYMUTEX_HPP__ +#define __TRACYMUTEX_HPP__ + +#if defined _MSC_VER + +# include <shared_mutex> + +namespace tracy +{ +using TracyMutex = std::shared_mutex; +} + +#elif defined __CYGWIN__ + +#include "tracy_benaphore.h" + +namespace tracy +{ +using TracyMutex = NonRecursiveBenaphore; +} + +#else + +#include <mutex> + +namespace tracy +{ +using TracyMutex = std::mutex; +} + +#endif + +#endif diff --git a/libs/tracy/common/TracyProtocol.hpp b/libs/tracy/common/TracyProtocol.hpp @@ -0,0 +1,103 @@ +#ifndef __TRACYPROTOCOL_HPP__ +#define __TRACYPROTOCOL_HPP__ + +#include <limits> +#include <stdint.h> + +namespace tracy +{ + +constexpr unsigned Lz4CompressBound( unsigned isize ) { return isize + ( isize / 255 ) + 16; } + +enum : uint32_t { ProtocolVersion = 24 }; +enum : uint32_t { BroadcastVersion = 0 }; + +using lz4sz_t = uint32_t; + +enum { TargetFrameSize = 256 * 1024 }; +enum { LZ4Size = Lz4CompressBound( TargetFrameSize ) }; +static_assert( LZ4Size <= std::numeric_limits<lz4sz_t>::max(), "LZ4Size greater than lz4sz_t" ); +static_assert( TargetFrameSize * 2 >= 64 * 1024, "Not enough space for LZ4 stream buffer" ); + +enum { HandshakeShibbolethSize = 8 }; +static const char HandshakeShibboleth[HandshakeShibbolethSize] = { 'T', 'r', 'a', 'c', 'y', 'P', 'r', 'f' }; + +enum HandshakeStatus : uint8_t +{ + HandshakePending, + HandshakeWelcome, + HandshakeProtocolMismatch, + HandshakeNotAvailable, + HandshakeDropped +}; + +enum { WelcomeMessageProgramNameSize = 64 }; +enum { WelcomeMessageHostInfoSize = 1024 }; + +#pragma pack( 1 ) + +enum ServerQuery : uint8_t +{ + ServerQueryTerminate, + ServerQueryString, + ServerQueryThreadString, + ServerQuerySourceLocation, + ServerQueryPlotName, + ServerQueryCallstackFrame, + ServerQueryFrameName, + ServerQueryDisconnect, + ServerQueryExternalName, + 
ServerQueryParameter +}; + +struct ServerQueryPacket +{ + ServerQuery type; + uint64_t ptr; +}; + +enum { ServerQueryPacketSize = sizeof( ServerQueryPacket ) }; + + +struct WelcomeMessage +{ + double timerMul; + int64_t initBegin; + int64_t initEnd; + uint64_t delay; + uint64_t resolution; + uint64_t epoch; + uint64_t pid; + uint8_t onDemand; + uint8_t isApple; + char programName[WelcomeMessageProgramNameSize]; + char hostInfo[WelcomeMessageHostInfoSize]; +}; + +enum { WelcomeMessageSize = sizeof( WelcomeMessage ) }; + + +struct OnDemandPayloadMessage +{ + uint64_t frames; + uint64_t currentTime; +}; + +enum { OnDemandPayloadMessageSize = sizeof( OnDemandPayloadMessage ) }; + + +struct BroadcastMessage +{ + uint32_t broadcastVersion; + uint32_t protocolVersion; + uint32_t activeTime; // in seconds + char programName[WelcomeMessageProgramNameSize]; +}; + +enum { BroadcastMessageSize = sizeof( BroadcastMessage ) }; + +#pragma pack() + +} + +#endif diff --git a/libs/tracy/common/TracyQueue.hpp b/libs/tracy/common/TracyQueue.hpp @@ -0,0 +1,500 @@ +#ifndef __TRACYQUEUE_HPP__ +#define __TRACYQUEUE_HPP__ + +#include <stdint.h> + +namespace tracy +{ + +enum class QueueType : uint8_t +{ + ZoneText, + ZoneName, + Message, + MessageColor, + MessageCallstack, + MessageColorCallstack, + MessageAppInfo, + ZoneBeginAllocSrcLoc, + ZoneBeginAllocSrcLocCallstack, + CallstackMemory, + Callstack, + CallstackAlloc, + FrameImage, + ZoneBegin, + ZoneBeginCallstack, + ZoneEnd, + LockWait, + LockObtain, + LockRelease, + LockSharedWait, + LockSharedObtain, + LockSharedRelease, + MemAlloc, + MemFree, + MemAllocCallstack, + MemFreeCallstack, + GpuZoneBegin, + GpuZoneBeginCallstack, + GpuZoneEnd, + GpuZoneBeginSerial, + GpuZoneBeginCallstackSerial, + GpuZoneEndSerial, + PlotData, + ContextSwitch, + ThreadWakeup, + GpuTime, + Terminate, + KeepAlive, + ThreadContext, + Crash, + CrashReport, + ZoneValidation, + FrameMarkMsg, + FrameMarkMsgStart, + FrameMarkMsgEnd, + SourceLocation, + LockAnnounce, + LockTerminate, + LockMark, + MessageLiteral, + MessageLiteralColor, + MessageLiteralCallstack, + MessageLiteralColorCallstack, + GpuNewContext, + CallstackFrameSize, + CallstackFrame, + SysTimeReport, + TidToPid, + PlotConfig, + ParamSetup, + StringData, + ThreadName, + CustomStringData, + PlotName, + SourceLocationPayload, + CallstackPayload, + CallstackAllocPayload, + FrameName, + FrameImageData, + ExternalName, + ExternalThreadName, + NUM_TYPES +}; + +#pragma pack( 1 ) + +struct QueueThreadContext +{ + uint64_t thread; +}; + +struct QueueZoneBegin +{ + int64_t time; + uint64_t srcloc; // ptr +}; + +struct QueueZoneEnd +{ + int64_t time; +}; + +struct QueueZoneValidation +{ + uint32_t id; +}; + +struct QueueStringTransfer +{ + uint64_t ptr; +}; + +struct QueueFrameMark +{ + int64_t time; + uint64_t name; // ptr +}; + +struct QueueFrameImage +{ + uint64_t image; // ptr + uint64_t frame; + uint16_t w; + uint16_t h; + uint8_t flip; +}; + +struct QueueSourceLocation +{ + uint64_t name; + uint64_t function; // ptr + uint64_t file; // ptr + uint32_t line; + uint8_t r; + uint8_t g; + uint8_t b; +}; + +struct QueueZoneText +{ + uint64_t text; // ptr +}; + +enum class LockType : uint8_t +{ + Lockable, + SharedLockable +}; + +struct QueueLockAnnounce +{ + uint32_t id; + int64_t time; + uint64_t lckloc; // ptr + LockType type; +}; + +struct QueueLockTerminate +{ + uint32_t id; + int64_t time; + LockType type; +}; + +struct QueueLockWait +{ + uint64_t thread; + uint32_t id; + int64_t time; + LockType type; +}; + +struct QueueLockObtain 
+{ + uint64_t thread; + uint32_t id; + int64_t time; +}; + +struct QueueLockRelease +{ + uint64_t thread; + uint32_t id; + int64_t time; +}; + +struct QueueLockMark +{ + uint64_t thread; + uint32_t id; + uint64_t srcloc; // ptr +}; + +enum class PlotDataType : uint8_t +{ + Float, + Double, + Int +}; + +struct QueuePlotData +{ + uint64_t name; // ptr + int64_t time; + PlotDataType type; + union + { + double d; + float f; + int64_t i; + } data; +}; + +struct QueueMessage +{ + int64_t time; + uint64_t text; // ptr +}; + +struct QueueMessageColor : public QueueMessage +{ + uint8_t r; + uint8_t g; + uint8_t b; +}; + +struct QueueGpuNewContext +{ + int64_t cpuTime; + int64_t gpuTime; + uint64_t thread; + float period; + uint8_t context; + uint8_t accuracyBits; +}; + +struct QueueGpuZoneBegin +{ + int64_t cpuTime; + uint64_t srcloc; + uint64_t thread; + uint16_t queryId; + uint8_t context; +}; + +struct QueueGpuZoneEnd +{ + int64_t cpuTime; + uint64_t thread; + uint16_t queryId; + uint8_t context; +}; + +struct QueueGpuTime +{ + int64_t gpuTime; + uint16_t queryId; + uint8_t context; +}; + +struct QueueMemAlloc +{ + int64_t time; + uint64_t thread; + uint64_t ptr; + char size[6]; +}; + +struct QueueMemFree +{ + int64_t time; + uint64_t thread; + uint64_t ptr; +}; + +struct QueueCallstackMemory +{ + uint64_t ptr; +}; + +struct QueueCallstack +{ + uint64_t ptr; +}; + +struct QueueCallstackAlloc +{ + uint64_t ptr; + uint64_t nativePtr; +}; + +struct QueueCallstackFrameSize +{ + uint64_t ptr; + uint8_t size; +}; + +struct QueueCallstackFrame +{ + uint64_t name; + uint64_t file; + uint32_t line; +}; + +struct QueueCrashReport +{ + int64_t time; + uint64_t text; // ptr +}; + +struct QueueSysTime +{ + int64_t time; + float sysTime; +}; + +struct QueueContextSwitch +{ + int64_t time; + uint64_t oldThread; + uint64_t newThread; + uint8_t cpu; + uint8_t reason; + uint8_t state; +}; + +struct QueueThreadWakeup +{ + int64_t time; + uint64_t thread; +}; + +struct QueueTidToPid +{ + uint64_t tid; + uint64_t pid; +}; + +enum class PlotFormatType : uint8_t +{ + Number, + Memory, + Percentage +}; + +struct QueuePlotConfig +{ + uint64_t name; // ptr + uint8_t type; +}; + +struct QueueParamSetup +{ + uint32_t idx; + uint64_t name; // ptr + uint8_t isBool; + int32_t val; +}; + +struct QueueHeader +{ + union + { + QueueType type; + uint8_t idx; + }; +}; + +struct QueueItem +{ + QueueHeader hdr; + union + { + QueueThreadContext threadCtx; + QueueZoneBegin zoneBegin; + QueueZoneEnd zoneEnd; + QueueZoneValidation zoneValidation; + QueueStringTransfer stringTransfer; + QueueFrameMark frameMark; + QueueFrameImage frameImage; + QueueSourceLocation srcloc; + QueueZoneText zoneText; + QueueLockAnnounce lockAnnounce; + QueueLockTerminate lockTerminate; + QueueLockWait lockWait; + QueueLockObtain lockObtain; + QueueLockRelease lockRelease; + QueueLockMark lockMark; + QueuePlotData plotData; + QueueMessage message; + QueueMessageColor messageColor; + QueueGpuNewContext gpuNewContext; + QueueGpuZoneBegin gpuZoneBegin; + QueueGpuZoneEnd gpuZoneEnd; + QueueGpuTime gpuTime; + QueueMemAlloc memAlloc; + QueueMemFree memFree; + QueueCallstackMemory callstackMemory; + QueueCallstack callstack; + QueueCallstackAlloc callstackAlloc; + QueueCallstackFrameSize callstackFrameSize; + QueueCallstackFrame callstackFrame; + QueueCrashReport crashReport; + QueueSysTime sysTime; + QueueContextSwitch contextSwitch; + QueueThreadWakeup threadWakeup; + QueueTidToPid tidToPid; + QueuePlotConfig plotConfig; + QueueParamSetup paramSetup; + }; +}; 
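For illustration, here is a minimal sketch (not part of the patch) of how an event is meant to be packed into this tagged union: the header byte selects the active payload member, and the QueueDataSize table further below gives the number of bytes that are actually meaningful for each type. The helper function, the include path, and the literal values are hypothetical.

    #include <stdint.h>
    #include "TracyQueue.hpp"  // assumed include path for the header shown above

    // Hedged sketch: pack a zone-begin event. The real producer writes the item
    // into a lock-free queue; that step is omitted here.
    static void SketchPackZoneBegin( int64_t now, uint64_t srcloc )
    {
        tracy::QueueItem item;
        item.hdr.type = tracy::QueueType::ZoneBegin;   // selects the zoneBegin member
        item.zoneBegin.time = now;                     // profiler timestamp
        item.zoneBegin.srcloc = srcloc;                // pointer to a static source location record
        // sizeof( item ) is 32 (see the static_assert below); only
        // QueueDataSize[item.hdr.idx] bytes of it carry payload for this type.
        (void)item;
    }
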
+#pragma pack() + + +enum { QueueItemSize = sizeof( QueueItem ) }; + +static const size_t QueueDataSize[] = { + sizeof( QueueHeader ) + sizeof( QueueZoneText ), + sizeof( QueueHeader ) + sizeof( QueueZoneText ), // zone name + sizeof( QueueHeader ) + sizeof( QueueMessage ), + sizeof( QueueHeader ) + sizeof( QueueMessageColor ), + sizeof( QueueHeader ) + sizeof( QueueMessage ), // callstack + sizeof( QueueHeader ) + sizeof( QueueMessageColor ), // callstack + sizeof( QueueHeader ) + sizeof( QueueMessage ), // app info + sizeof( QueueHeader ) + sizeof( QueueZoneBegin ), // allocated source location + sizeof( QueueHeader ) + sizeof( QueueZoneBegin ), // allocated source location, callstack + sizeof( QueueHeader ) + sizeof( QueueCallstackMemory ), + sizeof( QueueHeader ) + sizeof( QueueCallstack ), + sizeof( QueueHeader ) + sizeof( QueueCallstackAlloc ), + sizeof( QueueHeader ) + sizeof( QueueFrameImage ), + sizeof( QueueHeader ) + sizeof( QueueZoneBegin ), + sizeof( QueueHeader ) + sizeof( QueueZoneBegin ), // callstack + sizeof( QueueHeader ) + sizeof( QueueZoneEnd ), + sizeof( QueueHeader ) + sizeof( QueueLockWait ), + sizeof( QueueHeader ) + sizeof( QueueLockObtain ), + sizeof( QueueHeader ) + sizeof( QueueLockRelease ), + sizeof( QueueHeader ) + sizeof( QueueLockWait ), // shared + sizeof( QueueHeader ) + sizeof( QueueLockObtain ), // shared + sizeof( QueueHeader ) + sizeof( QueueLockRelease ), // shared + sizeof( QueueHeader ) + sizeof( QueueMemAlloc ), + sizeof( QueueHeader ) + sizeof( QueueMemFree ), + sizeof( QueueHeader ) + sizeof( QueueMemAlloc ), // callstack + sizeof( QueueHeader ) + sizeof( QueueMemFree ), // callstack + sizeof( QueueHeader ) + sizeof( QueueGpuZoneBegin ), + sizeof( QueueHeader ) + sizeof( QueueGpuZoneBegin ), // callstack + sizeof( QueueHeader ) + sizeof( QueueGpuZoneEnd ), + sizeof( QueueHeader ) + sizeof( QueueGpuZoneBegin ), // serial + sizeof( QueueHeader ) + sizeof( QueueGpuZoneBegin ), // serial, callstack + sizeof( QueueHeader ) + sizeof( QueueGpuZoneEnd ), // serial + sizeof( QueueHeader ) + sizeof( QueuePlotData ), + sizeof( QueueHeader ) + sizeof( QueueContextSwitch ), + sizeof( QueueHeader ) + sizeof( QueueThreadWakeup ), + sizeof( QueueHeader ) + sizeof( QueueGpuTime ), + // above items must be first + sizeof( QueueHeader ), // terminate + sizeof( QueueHeader ), // keep alive + sizeof( QueueHeader ) + sizeof( QueueThreadContext ), + sizeof( QueueHeader ), // crash + sizeof( QueueHeader ) + sizeof( QueueCrashReport ), + sizeof( QueueHeader ) + sizeof( QueueZoneValidation ), + sizeof( QueueHeader ) + sizeof( QueueFrameMark ), // continuous frames + sizeof( QueueHeader ) + sizeof( QueueFrameMark ), // start + sizeof( QueueHeader ) + sizeof( QueueFrameMark ), // end + sizeof( QueueHeader ) + sizeof( QueueSourceLocation ), + sizeof( QueueHeader ) + sizeof( QueueLockAnnounce ), + sizeof( QueueHeader ) + sizeof( QueueLockTerminate ), + sizeof( QueueHeader ) + sizeof( QueueLockMark ), + sizeof( QueueHeader ) + sizeof( QueueMessage ), // literal + sizeof( QueueHeader ) + sizeof( QueueMessageColor ), // literal + sizeof( QueueHeader ) + sizeof( QueueMessage ), // literal, callstack + sizeof( QueueHeader ) + sizeof( QueueMessageColor ), // literal, callstack + sizeof( QueueHeader ) + sizeof( QueueGpuNewContext ), + sizeof( QueueHeader ) + sizeof( QueueCallstackFrameSize ), + sizeof( QueueHeader ) + sizeof( QueueCallstackFrame ), + sizeof( QueueHeader ) + sizeof( QueueSysTime ), + sizeof( QueueHeader ) + sizeof( QueueTidToPid ), + sizeof( QueueHeader ) + sizeof( 
QueuePlotConfig ), + sizeof( QueueHeader ) + sizeof( QueueParamSetup ), + // keep all QueueStringTransfer below + sizeof( QueueHeader ) + sizeof( QueueStringTransfer ), // string data + sizeof( QueueHeader ) + sizeof( QueueStringTransfer ), // thread name + sizeof( QueueHeader ) + sizeof( QueueStringTransfer ), // custom string data + sizeof( QueueHeader ) + sizeof( QueueStringTransfer ), // plot name + sizeof( QueueHeader ) + sizeof( QueueStringTransfer ), // allocated source location payload + sizeof( QueueHeader ) + sizeof( QueueStringTransfer ), // callstack payload + sizeof( QueueHeader ) + sizeof( QueueStringTransfer ), // callstack alloc payload + sizeof( QueueHeader ) + sizeof( QueueStringTransfer ), // frame name + sizeof( QueueHeader ) + sizeof( QueueStringTransfer ), // frame image data + sizeof( QueueHeader ) + sizeof( QueueStringTransfer ), // external name + sizeof( QueueHeader ) + sizeof( QueueStringTransfer ), // external thread name +}; + +static_assert( QueueItemSize == 32, "Queue item size not 32 bytes" ); +static_assert( sizeof( QueueDataSize ) / sizeof( size_t ) == (uint8_t)QueueType::NUM_TYPES, "QueueDataSize mismatch" ); +static_assert( sizeof( void* ) <= sizeof( uint64_t ), "Pointer size > 8 bytes" ); +static_assert( sizeof( void* ) == sizeof( uintptr_t ), "Pointer size != uintptr_t" ); + +}; + +#endif diff --git a/libs/tracy/common/TracySocket.cpp b/libs/tracy/common/TracySocket.cpp @@ -0,0 +1,561 @@ +#include <assert.h> +#include <new> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <sys/types.h> + +#include "TracyAlloc.hpp" +#include "TracySocket.hpp" + +#ifdef _WIN32 +# ifndef NOMINMAX +# define NOMINMAX +# endif +# include <winsock2.h> +# include <ws2tcpip.h> +# ifdef _MSC_VER +# pragma warning(disable:4244) +# pragma warning(disable:4267) +# endif +# define poll WSAPoll +#else +# include <arpa/inet.h> +# include <sys/socket.h> +# include <sys/param.h> +# include <netinet/in.h> +# include <netdb.h> +# include <unistd.h> +# include <poll.h> +#endif + +#ifndef MSG_NOSIGNAL +# define MSG_NOSIGNAL 0 +#endif + +namespace tracy +{ + +#ifdef _WIN32 +typedef SOCKET socket_t; +#else +typedef int socket_t; +#endif + +#ifdef _WIN32 +struct __wsinit +{ + __wsinit() + { + WSADATA wsaData; + if( WSAStartup( MAKEWORD( 2, 2 ), &wsaData ) != 0 ) + { + fprintf( stderr, "Cannot init winsock.\n" ); + exit( 1 ); + } + } +}; + +void InitWinSock() +{ + static __wsinit init; +} +#endif + +Socket::Socket() + : m_buf( (char*)tracy_malloc( BufSize ) ) + , m_bufPtr( nullptr ) + , m_sock( -1 ) + , m_bufLeft( 0 ) +{ +#ifdef _WIN32 + InitWinSock(); +#endif +} + +Socket::Socket( int sock ) + : m_buf( (char*)tracy_malloc( BufSize ) ) + , m_bufPtr( nullptr ) + , m_sock( sock ) + , m_bufLeft( 0 ) +{ +} + +Socket::~Socket() +{ + tracy_free( m_buf ); + if( m_sock != -1 ) + { + Close(); + } +} + +bool Socket::Connect( const char* addr, int port ) +{ + assert( m_sock == -1 ); + + struct addrinfo hints; + struct addrinfo *res, *ptr; + + memset( &hints, 0, sizeof( hints ) ); + hints.ai_family = AF_UNSPEC; + hints.ai_socktype = SOCK_STREAM; + + char portbuf[32]; + sprintf( portbuf, "%i", port ); + + if( getaddrinfo( addr, portbuf, &hints, &res ) != 0 ) return false; + int sock = 0; + for( ptr = res; ptr; ptr = ptr->ai_next ) + { + if( ( sock = socket( ptr->ai_family, ptr->ai_socktype, ptr->ai_protocol ) ) == -1 ) continue; +#if defined __APPLE__ + int val = 1; + setsockopt( sock, SOL_SOCKET, SO_NOSIGPIPE, &val, sizeof( val ) ); +#endif + if( connect( sock, ptr->ai_addr, 
ptr->ai_addrlen ) == -1 ) + { +#ifdef _WIN32 + closesocket( sock ); +#else + close( sock ); +#endif + continue; + } + break; + } + freeaddrinfo( res ); + if( !ptr ) return false; + + m_sock = sock; + return true; +} + +void Socket::Close() +{ + assert( m_sock != -1 ); +#ifdef _WIN32 + closesocket( m_sock ); +#else + close( m_sock ); +#endif + m_sock = -1; +} + +int Socket::Send( const void* _buf, int len ) +{ + auto buf = (const char*)_buf; + assert( m_sock != -1 ); + auto start = buf; + while( len > 0 ) + { + auto ret = send( m_sock, buf, len, MSG_NOSIGNAL ); + if( ret == -1 ) return -1; + len -= ret; + buf += ret; + } + return int( buf - start ); +} + +int Socket::GetSendBufSize() +{ + int bufSize; +#if defined _WIN32 || defined __CYGWIN__ + int sz = sizeof( bufSize ); + getsockopt( m_sock, SOL_SOCKET, SO_SNDBUF, (char*)&bufSize, &sz ); +#else + socklen_t sz = sizeof( bufSize ); + getsockopt( m_sock, SOL_SOCKET, SO_SNDBUF, &bufSize, &sz ); +#endif + return bufSize; +} + +int Socket::RecvBuffered( void* buf, int len, int timeout ) +{ + if( len <= m_bufLeft ) + { + memcpy( buf, m_bufPtr, len ); + m_bufPtr += len; + m_bufLeft -= len; + return len; + } + + if( m_bufLeft > 0 ) + { + memcpy( buf, m_bufPtr, m_bufLeft ); + const auto ret = m_bufLeft; + m_bufLeft = 0; + return ret; + } + + if( len >= BufSize ) return Recv( buf, len, timeout ); + + m_bufLeft = Recv( m_buf, BufSize, timeout ); + if( m_bufLeft <= 0 ) return m_bufLeft; + + const auto sz = len < m_bufLeft ? len : m_bufLeft; + memcpy( buf, m_buf, sz ); + m_bufPtr = m_buf + sz; + m_bufLeft -= sz; + return sz; +} + +int Socket::Recv( void* _buf, int len, int timeout ) +{ + auto buf = (char*)_buf; + + struct pollfd fd; + fd.fd = (socket_t)m_sock; + fd.events = POLLIN; + + if( poll( &fd, 1, timeout ) > 0 ) + { + return recv( m_sock, buf, len, 0 ); + } + else + { + return -1; + } +} + +bool Socket::Read( void* _buf, int len, int timeout, std::function<bool()> exitCb ) +{ + auto buf = (char*)_buf; + + while( len > 0 ) + { + if( exitCb() ) return false; + const auto sz = RecvBuffered( buf, len, timeout ); + switch( sz ) + { + case 0: + return false; + case -1: +#ifdef _WIN32 + { + auto err = WSAGetLastError(); + if( err == WSAECONNABORTED || err == WSAECONNRESET ) return false; + } +#endif + break; + default: + len -= sz; + buf += sz; + break; + } + } + + return true; +} + +bool Socket::ReadRaw( void* _buf, int len, int timeout ) +{ + auto buf = (char*)_buf; + while( len > 0 ) + { + const auto sz = Recv( buf, len, timeout ); + if( sz <= 0 ) return false; + len -= sz; + buf += sz; + } + return true; +} + +bool Socket::HasData() +{ + if( m_bufLeft > 0 ) return true; + + struct pollfd fd; + fd.fd = (socket_t)m_sock; + fd.events = POLLIN; + + return poll( &fd, 1, 0 ) > 0; +} + + +ListenSocket::ListenSocket() + : m_sock( -1 ) +{ +#ifdef _WIN32 + InitWinSock(); +#endif +} + +ListenSocket::~ListenSocket() +{ + if( m_sock != -1 ) Close(); +} + +bool ListenSocket::Listen( int port, int backlog ) +{ + assert( m_sock == -1 ); + + struct addrinfo* res; + struct addrinfo hints; + + memset( &hints, 0, sizeof( hints ) ); + hints.ai_family = AF_INET6; + hints.ai_socktype = SOCK_STREAM; + hints.ai_flags = AI_PASSIVE; + + char portbuf[32]; + sprintf( portbuf, "%i", port ); + + if( getaddrinfo( nullptr, portbuf, &hints, &res ) != 0 ) return false; + + m_sock = socket( res->ai_family, res->ai_socktype, res->ai_protocol ); +#if defined _WIN32 || defined __CYGWIN__ + unsigned long val = 0; + setsockopt( m_sock, IPPROTO_IPV6, IPV6_V6ONLY, (const char*)&val, sizeof( 
val ) ); +#elif defined BSD + int val = 0; + setsockopt( m_sock, IPPROTO_IPV6, IPV6_V6ONLY, (const char*)&val, sizeof( val ) ); + val = 1; + setsockopt( m_sock, SOL_SOCKET, SO_REUSEADDR, &val, sizeof( val ) ); +#else + int val = 1; + setsockopt( m_sock, SOL_SOCKET, SO_REUSEADDR, &val, sizeof( val ) ); +#endif + if( bind( m_sock, res->ai_addr, res->ai_addrlen ) == -1 ) { freeaddrinfo( res ); return false; } + if( listen( m_sock, backlog ) == -1 ) { freeaddrinfo( res ); return false; } + freeaddrinfo( res ); + return true; +} + +Socket* ListenSocket::Accept() +{ + struct sockaddr_storage remote; + socklen_t sz = sizeof( remote ); + + struct pollfd fd; + fd.fd = (socket_t)m_sock; + fd.events = POLLIN; + + if( poll( &fd, 1, 10 ) > 0 ) + { + int sock = accept( m_sock, (sockaddr*)&remote, &sz); + if( sock == -1 ) return nullptr; + +#if defined __APPLE__ + int val = 1; + setsockopt( sock, SOL_SOCKET, SO_NOSIGPIPE, &val, sizeof( val ) ); +#endif + + auto ptr = (Socket*)tracy_malloc( sizeof( Socket ) ); + new(ptr) Socket( sock ); + return ptr; + } + else + { + return nullptr; + } +} + +void ListenSocket::Close() +{ + assert( m_sock != -1 ); +#ifdef _WIN32 + closesocket( m_sock ); +#else + close( m_sock ); +#endif + m_sock = -1; +} + +UdpBroadcast::UdpBroadcast() + : m_sock( -1 ) +{ +#ifdef _WIN32 + InitWinSock(); +#endif +} + +UdpBroadcast::~UdpBroadcast() +{ + if( m_sock != -1 ) Close(); +} + +bool UdpBroadcast::Open( const char* addr, int port ) +{ + assert( m_sock == -1 ); + + struct addrinfo hints; + struct addrinfo *res, *ptr; + + memset( &hints, 0, sizeof( hints ) ); + hints.ai_family = AF_INET; + hints.ai_socktype = SOCK_DGRAM; + + char portbuf[32]; + sprintf( portbuf, "%i", port ); + + if( getaddrinfo( addr, portbuf, &hints, &res ) != 0 ) return false; + int sock = 0; + for( ptr = res; ptr; ptr = ptr->ai_next ) + { + if( ( sock = socket( ptr->ai_family, ptr->ai_socktype, ptr->ai_protocol ) ) == -1 ) continue; +#if defined __APPLE__ + int val = 1; + setsockopt( sock, SOL_SOCKET, SO_NOSIGPIPE, &val, sizeof( val ) ); +#endif +#if defined _WIN32 || defined __CYGWIN__ + unsigned long broadcast = 1; + if( setsockopt( sock, SOL_SOCKET, SO_BROADCAST, (const char*)&broadcast, sizeof( broadcast ) ) == -1 ) +#else + int broadcast = 1; + if( setsockopt( sock, SOL_SOCKET, SO_BROADCAST, &broadcast, sizeof( broadcast ) ) == -1 ) +#endif + { +#ifdef _WIN32 + closesocket( sock ); +#else + close( sock ); +#endif + continue; + } + break; + } + freeaddrinfo( res ); + if( !ptr ) return false; + + m_sock = sock; + return true; +} + +void UdpBroadcast::Close() +{ + assert( m_sock != -1 ); +#ifdef _WIN32 + closesocket( m_sock ); +#else + close( m_sock ); +#endif + m_sock = -1; +} + +int UdpBroadcast::Send( int port, const void* data, int len ) +{ + assert( m_sock != -1 ); + struct sockaddr_in addr; + addr.sin_family = AF_INET; + addr.sin_port = htons( port ); + addr.sin_addr.s_addr = INADDR_BROADCAST; + return sendto( m_sock, (const char*)data, len, MSG_NOSIGNAL, (sockaddr*)&addr, sizeof( addr ) ); +} + +IpAddress::IpAddress() + : m_number( 0 ) +{ + *m_text = '\0'; +} + +IpAddress::~IpAddress() +{ +} + +void IpAddress::Set( const struct sockaddr& addr ) +{ +#if __MINGW32__ + auto ai = (struct sockaddr_in*)&addr; +#else + auto ai = (const struct sockaddr_in*)&addr; +#endif + inet_ntop( AF_INET, &ai->sin_addr, m_text, 17 ); + m_number = ai->sin_addr.s_addr; +} + +UdpListen::UdpListen() + : m_sock( -1 ) +{ +#ifdef _WIN32 + InitWinSock(); +#endif +} + +UdpListen::~UdpListen() +{ + if( m_sock != -1 ) Close(); +} + 
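As an aside, a minimal sketch (not part of the patch) of how the two UDP classes in this file are meant to pair up for discovery: one side announces itself with UdpBroadcast while the other waits with UdpListen and records the sender's address. The port number, message contents, and include path here are assumptions.

    #include <stdio.h>
    #include "TracySocket.hpp"  // assumed include path for the classes in this file

    // Hedged sketch: send a broadcast announcement and receive it elsewhere.
    static void SketchDiscovery()
    {
        tracy::UdpBroadcast broadcast;
        if( broadcast.Open( "255.255.255.255", 8086 ) )  // port is an assumption
        {
            const char msg[] = "hello";
            broadcast.Send( 8086, msg, sizeof( msg ) );
        }

        tracy::UdpListen listener;
        if( listener.Listen( 8086 ) )
        {
            size_t len;
            tracy::IpAddress addr;
            // Read() returns nullptr if nothing arrived within its 10 ms poll window.
            if( listener.Read( len, addr ) != nullptr )
            {
                printf( "announcement: %zu bytes from %s\n", len, addr.GetText() );
            }
        }
    }
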
+bool UdpListen::Listen( int port ) +{ + assert( m_sock == -1 ); + + int sock; + if( ( sock = socket( AF_INET, SOCK_DGRAM, 0 ) ) == -1 ) return false; + +#if defined __APPLE__ + int val = 1; + setsockopt( sock, SOL_SOCKET, SO_NOSIGPIPE, &val, sizeof( val ) ); +#endif +#if defined _WIN32 || defined __CYGWIN__ + unsigned long reuse = 1; + setsockopt( m_sock, SOL_SOCKET, SO_REUSEADDR, (const char*)&reuse, sizeof( reuse ) ); +#else + int reuse = 1; + setsockopt( m_sock, SOL_SOCKET, SO_REUSEADDR, &reuse, sizeof( reuse ) ); +#endif +#if defined _WIN32 || defined __CYGWIN__ + unsigned long broadcast = 1; + if( setsockopt( sock, SOL_SOCKET, SO_BROADCAST, (const char*)&broadcast, sizeof( broadcast ) ) == -1 ) +#else + int broadcast = 1; + if( setsockopt( sock, SOL_SOCKET, SO_BROADCAST, &broadcast, sizeof( broadcast ) ) == -1 ) +#endif + { +#ifdef _WIN32 + closesocket( sock ); +#else + close( sock ); +#endif + return false; + } + + struct sockaddr_in addr; + addr.sin_family = AF_INET; + addr.sin_port = htons( port ); + addr.sin_addr.s_addr = INADDR_ANY; + + if( bind( sock, (sockaddr*)&addr, sizeof( addr ) ) == -1 ) + { +#ifdef _WIN32 + closesocket( sock ); +#else + close( sock ); +#endif + return false; + } + + m_sock = sock; + return true; +} + +void UdpListen::Close() +{ + assert( m_sock != -1 ); +#ifdef _WIN32 + closesocket( m_sock ); +#else + close( m_sock ); +#endif + m_sock = -1; +} + +const char* UdpListen::Read( size_t& len, IpAddress& addr ) +{ + static char buf[2048]; + + struct pollfd fd; + fd.fd = (socket_t)m_sock; + fd.events = POLLIN; + if( poll( &fd, 1, 10 ) <= 0 ) return nullptr; + + sockaddr sa; + socklen_t salen = sizeof( struct sockaddr ); + len = (size_t)recvfrom( m_sock, buf, 2048, 0, &sa, &salen ); + addr.Set( sa ); + + return buf; +} + +} diff --git a/libs/tracy/common/TracySocket.hpp b/libs/tracy/common/TracySocket.hpp @@ -0,0 +1,131 @@ +#ifndef __TRACYSOCKET_HPP__ +#define __TRACYSOCKET_HPP__ + +#include <functional> + +struct sockaddr; + +namespace tracy +{ + +#ifdef _WIN32 +void InitWinSock(); +#endif + +class Socket +{ + enum { BufSize = 128 * 1024 }; + +public: + Socket(); + Socket( int sock ); + ~Socket(); + + bool Connect( const char* addr, int port ); + void Close(); + + int Send( const void* buf, int len ); + int GetSendBufSize(); + + bool Read( void* buf, int len, int timeout, std::function<bool()> exitCb ); + bool ReadRaw( void* buf, int len, int timeout ); + bool HasData(); + + Socket( const Socket& ) = delete; + Socket( Socket&& ) = delete; + Socket& operator=( const Socket& ) = delete; + Socket& operator=( Socket&& ) = delete; + +private: + int RecvBuffered( void* buf, int len, int timeout ); + int Recv( void* buf, int len, int timeout ); + + char* m_buf; + char* m_bufPtr; + int m_sock; + int m_bufLeft; +}; + +class ListenSocket +{ +public: + ListenSocket(); + ~ListenSocket(); + + bool Listen( int port, int backlog ); + Socket* Accept(); + void Close(); + + ListenSocket( const ListenSocket& ) = delete; + ListenSocket( ListenSocket&& ) = delete; + ListenSocket& operator=( const ListenSocket& ) = delete; + ListenSocket& operator=( ListenSocket&& ) = delete; + +private: + int m_sock; +}; + +class UdpBroadcast +{ +public: + UdpBroadcast(); + ~UdpBroadcast(); + + bool Open( const char* addr, int port ); + void Close(); + + int Send( int port, const void* data, int len ); + + UdpBroadcast( const UdpBroadcast& ) = delete; + UdpBroadcast( UdpBroadcast&& ) = delete; + UdpBroadcast& operator=( const UdpBroadcast& ) = delete; + UdpBroadcast& operator=( UdpBroadcast&& ) = 
delete; + +private: + int m_sock; +}; + +class IpAddress +{ +public: + IpAddress(); + ~IpAddress(); + + void Set( const struct sockaddr& addr ); + + uint32_t GetNumber() const { return m_number; } + const char* GetText() const { return m_text; } + + IpAddress( const IpAddress& ) = delete; + IpAddress( IpAddress&& ) = delete; + IpAddress& operator=( const IpAddress& ) = delete; + IpAddress& operator=( IpAddress&& ) = delete; + +private: + uint32_t m_number; + char m_text[17]; +}; + +class UdpListen +{ +public: + UdpListen(); + ~UdpListen(); + + bool Listen( int port ); + void Close(); + + const char* Read( size_t& len, IpAddress& addr ); + + UdpListen( const UdpListen& ) = delete; + UdpListen( UdpListen&& ) = delete; + UdpListen& operator=( const UdpListen& ) = delete; + UdpListen& operator=( UdpListen&& ) = delete; + +private: + int m_sock; +}; + +} + +#endif diff --git a/libs/tracy/common/TracySystem.cpp b/libs/tracy/common/TracySystem.cpp @@ -0,0 +1,187 @@ +#if defined _MSC_VER || defined __CYGWIN__ || defined _WIN32 +# ifndef WIN32_LEAN_AND_MEAN +# define WIN32_LEAN_AND_MEAN +# endif +# ifndef NOMINMAX +# define NOMINMAX +# endif +#endif +#if defined _WIN32 || defined __CYGWIN__ +# include <windows.h> +#else +# include <pthread.h> +# include <string.h> +# include <unistd.h> +#endif + +#ifdef __linux__ +# ifndef __ANDROID__ +# include <syscall.h> +# endif +# include <fcntl.h> +#endif + +#ifdef __MINGW32__ +# define __STDC_FORMAT_MACROS +#endif +#include <inttypes.h> +#include <stdio.h> +#include <stdlib.h> + +#include "TracySystem.hpp" + +#ifdef TRACY_ENABLE +# include <atomic> +# include "TracyAlloc.hpp" +#endif + +namespace tracy +{ + +#ifdef TRACY_ENABLE +struct ThreadNameData +{ + uint64_t id; + const char* name; + ThreadNameData* next; +}; +TRACY_API std::atomic<ThreadNameData*>& GetThreadNameData(); +TRACY_API void InitRPMallocThread(); +#endif + +void SetThreadName( const char* name ) +{ +#if defined _WIN32 || defined __CYGWIN__ +# if defined NTDDI_WIN10_RS2 && NTDDI_VERSION >= NTDDI_WIN10_RS2 + wchar_t buf[256]; + mbstowcs( buf, name, 256 ); + SetThreadDescription( GetCurrentThread(), buf ); +# elif defined _MSC_VER + const DWORD MS_VC_EXCEPTION=0x406D1388; +# pragma pack( push, 8 ) + struct THREADNAME_INFO + { + DWORD dwType; + LPCSTR szName; + DWORD dwThreadID; + DWORD dwFlags; + }; +# pragma pack(pop) + + DWORD ThreadId = GetCurrentThreadId(); + THREADNAME_INFO info; + info.dwType = 0x1000; + info.szName = name; + info.dwThreadID = ThreadId; + info.dwFlags = 0; + + __try + { + RaiseException( MS_VC_EXCEPTION, 0, sizeof(info)/sizeof(ULONG_PTR), (ULONG_PTR*)&info ); + } + __except(EXCEPTION_EXECUTE_HANDLER) + { + } +# endif +#elif defined _GNU_SOURCE && !defined __EMSCRIPTEN__ && !defined __CYGWIN__ + { + const auto sz = strlen( name ); + if( sz <= 15 ) + { + pthread_setname_np( pthread_self(), name ); + } + else + { + char buf[16]; + memcpy( buf, name, 15 ); + buf[15] = '\0'; + pthread_setname_np( pthread_self(), buf ); + } + } +#endif +#ifdef TRACY_ENABLE + { + InitRPMallocThread(); + const auto sz = strlen( name ); + char* buf = (char*)tracy_malloc( sz+1 ); + memcpy( buf, name, sz ); + buf[sz+1] = '\0'; + auto data = (ThreadNameData*)tracy_malloc( sizeof( ThreadNameData ) ); + data->id = detail::GetThreadHandleImpl(); + data->name = buf; + data->next = GetThreadNameData().load( std::memory_order_relaxed ); + while( !GetThreadNameData().compare_exchange_weak( data->next, data, std::memory_order_release, std::memory_order_relaxed ) ) {} + } +#endif +} + +const char* 
GetThreadName( uint64_t id ) +{ + static char buf[256]; +#ifdef TRACY_ENABLE + auto ptr = GetThreadNameData().load( std::memory_order_relaxed ); + while( ptr ) + { + if( ptr->id == id ) + { + return ptr->name; + } + ptr = ptr->next; + } +#else +# if defined _WIN32 || defined __CYGWIN__ +# if defined NTDDI_WIN10_RS2 && NTDDI_VERSION >= NTDDI_WIN10_RS2 + auto hnd = OpenThread( THREAD_QUERY_LIMITED_INFORMATION, FALSE, (DWORD)id ); + if( hnd != 0 ) + { + PWSTR tmp; + GetThreadDescription( hnd, &tmp ); + auto ret = wcstombs( buf, tmp, 256 ); + CloseHandle( hnd ); + if( ret != 0 ) + { + return buf; + } + } +# endif +# elif defined __GLIBC__ && !defined __ANDROID__ && !defined __EMSCRIPTEN__ && !defined __CYGWIN__ + if( pthread_getname_np( (pthread_t)id, buf, 256 ) == 0 ) + { + return buf; + } +# elif defined __linux__ + int cs, fd; + char path[32]; +# ifdef __ANDROID__ + int tid = gettid(); +# else + int tid = (int) syscall( SYS_gettid ); +# endif + snprintf( path, sizeof( path ), "/proc/self/task/%d/comm", tid ); + sprintf( buf, "%" PRIu64, id ); +# ifndef __ANDROID__ + pthread_setcancelstate( PTHREAD_CANCEL_DISABLE, &cs ); +# endif + if ( ( fd = open( path, O_RDONLY ) ) > 0) { + int len = read( fd, buf, 255 ); + if( len > 0 ) + { + buf[len] = 0; + if( len > 1 && buf[len-1] == '\n' ) + { + buf[len-1] = 0; + } + } + close( fd ); + } +# ifndef __ANDROID__ + pthread_setcancelstate( cs, 0 ); +# endif + return buf; +# endif +#endif + sprintf( buf, "%" PRIu64, id ); + return buf; +} + +} diff --git a/libs/tracy/common/TracySystem.hpp b/libs/tracy/common/TracySystem.hpp @@ -0,0 +1,80 @@ +#ifndef __TRACYSYSTEM_HPP__ +#define __TRACYSYSTEM_HPP__ + +#if defined _WIN32 || defined __CYGWIN__ +# ifndef _WINDOWS_ +extern "C" __declspec(dllimport) unsigned long __stdcall GetCurrentThreadId(void); +# endif +#elif defined __APPLE__ || ( !defined __ANDROID__ && !defined __linux__ ) +# include <pthread.h> +#endif + +#ifdef __linux__ +# include <unistd.h> +# ifdef __ANDROID__ +# include <sys/types.h> +# else +# include <sys/syscall.h> +# endif +#elif defined __FreeBSD__ +# include <sys/thr.h> +#elif defined __NetBSD__ || defined __DragonFly__ +# include <sys/lwp.h> +#elif defined __OpenBSD__ +# include <unistd.h> +#endif + +#include <stdint.h> + +#include "TracyApi.h" + +namespace tracy +{ + +namespace detail +{ +static inline uint64_t GetThreadHandleImpl() +{ +#if defined _WIN32 || defined __CYGWIN__ + static_assert( sizeof( decltype( GetCurrentThreadId() ) ) <= sizeof( uint64_t ), "Thread handle too big to fit in protocol" ); + return uint64_t( GetCurrentThreadId() ); +#elif defined __APPLE__ + uint64_t id; + pthread_threadid_np( pthread_self(), &id ); + return id; +#elif defined __ANDROID__ + return (uint64_t)gettid(); +#elif defined __linux__ + return (uint64_t)syscall( SYS_gettid ); +#elif defined __FreeBSD__ + long id; + thr_self( &id ); + return id; +#elif defined __NetBSD__ + return _lwp_self(); +#elif defined __DragonFly__ + return lwp_gettid(); +#elif defined __OpenBSD__ + return getthrid(); +#else + static_assert( sizeof( decltype( pthread_self() ) ) <= sizeof( uint64_t ), "Thread handle too big to fit in protocol" ); + return uint64_t( pthread_self() ); +#endif +} +} + +#ifdef TRACY_ENABLE +TRACY_API uint64_t GetThreadHandle(); +#else +static inline uint64_t GetThreadHandle() +{ + return detail::GetThreadHandleImpl(); +} +#endif + +void SetThreadName( const char* name ); +const char* GetThreadName( uint64_t id ); + +} + +#endif diff --git a/libs/tracy/common/tracy_benaphore.h 
b/libs/tracy/common/tracy_benaphore.h @@ -0,0 +1,68 @@ +// Copyright (c) 2015 Jeff Preshing +// +// This software is provided 'as-is', without any express or implied +// warranty. In no event will the authors be held liable for any damages +// arising from the use of this software. +// +// Permission is granted to anyone to use this software for any purpose, +// including commercial applications, and to alter it and redistribute it +// freely, subject to the following restrictions: +// +// 1. The origin of this software must not be misrepresented; you must not +// claim that you wrote the original software. If you use this software +// in a product, an acknowledgement in the product documentation would be +// appreciated but is not required. +// 2. Altered source versions must be plainly marked as such, and must not be +// misrepresented as being the original software. +// 3. This notice may not be removed or altered from any source distribution. + +#ifndef __TRACY_CPP11OM_BENAPHORE_H__ +#define __TRACY_CPP11OM_BENAPHORE_H__ + +#include <cassert> +#include <thread> +#include <atomic> +#include "tracy_sema.h" + +namespace tracy +{ + +class NonRecursiveBenaphore +{ +private: + std::atomic<int> m_contentionCount; + DefaultSemaphoreType m_sema; + +public: + NonRecursiveBenaphore() : m_contentionCount(0) {} + + void lock() + { + if (m_contentionCount.fetch_add(1, std::memory_order_acquire) > 0) + { + m_sema.wait(); + } + } + + bool try_lock() + { + if (m_contentionCount.load(std::memory_order_relaxed) != 0) + return false; + int expected = 0; + return m_contentionCount.compare_exchange_strong(expected, 1, std::memory_order_acquire); + } + + void unlock() + { + int oldCount = m_contentionCount.fetch_sub(1, std::memory_order_release); + assert(oldCount > 0); + if (oldCount > 1) + { + m_sema.signal(); + } + } +}; + +} + +#endif // __CPP11OM_BENAPHORE_H__ diff --git a/libs/tracy/common/tracy_lz4.cpp b/libs/tracy/common/tracy_lz4.cpp @@ -0,0 +1,2297 @@ +/* + LZ4 - Fast LZ compression algorithm + Copyright (C) 2011-present, Yann Collet. + + BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following disclaimer + in the documentation and/or other materials provided with the + distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ + You can contact the author at : + - LZ4 homepage : http://www.lz4.org + - LZ4 source repository : https://github.com/lz4/lz4 +*/ + +/*-************************************ +* Tuning parameters +**************************************/ +/* + * LZ4_HEAPMODE : + * Select how default compression functions will allocate memory for their hash table, + * in memory stack (0:default, fastest), or in memory heap (1:requires malloc()). + */ +#ifndef LZ4_HEAPMODE +# define LZ4_HEAPMODE 0 +#endif + +/* + * ACCELERATION_DEFAULT : + * Select "acceleration" for LZ4_compress_fast() when parameter value <= 0 + */ +#define ACCELERATION_DEFAULT 1 + + +/*-************************************ +* CPU Feature Detection +**************************************/ +/* LZ4_FORCE_MEMORY_ACCESS + * By default, access to unaligned memory is controlled by `memcpy()`, which is safe and portable. + * Unfortunately, on some target/compiler combinations, the generated assembly is sub-optimal. + * The below switch allow to select different access method for improved performance. + * Method 0 (default) : use `memcpy()`. Safe and portable. + * Method 1 : `__packed` statement. It depends on compiler extension (ie, not portable). + * This method is safe if your compiler supports it, and *generally* as fast or faster than `memcpy`. + * Method 2 : direct access. This method is portable but violate C standard. + * It can generate buggy code on targets which assembly generation depends on alignment. + * But in some circumstances, it's the only known way to get the most performance (ie GCC + ARMv6) + * See https://fastcompression.blogspot.fr/2015/08/accessing-unaligned-memory.html for details. + * Prefer these methods in priority order (0 > 1 > 2) + */ +#ifndef LZ4_FORCE_MEMORY_ACCESS /* can be defined externally */ +# if defined(__GNUC__) && \ + ( defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) || defined(__ARM_ARCH_6K__) \ + || defined(__ARM_ARCH_6Z__) || defined(__ARM_ARCH_6ZK__) || defined(__ARM_ARCH_6T2__) ) +# define LZ4_FORCE_MEMORY_ACCESS 2 +# elif (defined(__INTEL_COMPILER) && !defined(_WIN32)) || defined(__GNUC__) +# define LZ4_FORCE_MEMORY_ACCESS 1 +# endif +#endif + +/* + * LZ4_FORCE_SW_BITCOUNT + * Define this parameter if your target system or compiler does not support hardware bit count + */ +#if defined(_MSC_VER) && defined(_WIN32_WCE) /* Visual Studio for WinCE doesn't support Hardware bit count */ +# define LZ4_FORCE_SW_BITCOUNT +#endif + + + +/*-************************************ +* Dependency +**************************************/ +/* + * LZ4_SRC_INCLUDED: + * Amalgamation flag, whether lz4.c is included + */ +#ifndef LZ4_SRC_INCLUDED +# define LZ4_SRC_INCLUDED 1 +#endif + +#ifndef LZ4_STATIC_LINKING_ONLY +#define LZ4_STATIC_LINKING_ONLY +#endif + +#ifndef LZ4_DISABLE_DEPRECATE_WARNINGS +#define LZ4_DISABLE_DEPRECATE_WARNINGS /* due to LZ4_decompress_safe_withPrefix64k */ +#endif + +#include "tracy_lz4.hpp" +/* see also "memory routines" below */ + + +/*-************************************ +* Compiler Options +**************************************/ +#ifdef _MSC_VER /* Visual Studio */ +# include <intrin.h> +# pragma warning(disable : 4127) /* disable: C4127: conditional expression is constant */ +# pragma warning(disable : 4293) /* disable: C4293: too large shift (32-bits) */ +#endif /* _MSC_VER */ + +#ifndef LZ4_FORCE_INLINE +# ifdef _MSC_VER /* Visual Studio */ +# define LZ4_FORCE_INLINE static __forceinline +# else +# if defined (__cplusplus) || defined (__STDC_VERSION__) && __STDC_VERSION__ >= 
199901L /* C99 */ +# ifdef __GNUC__ +# define LZ4_FORCE_INLINE static inline __attribute__((always_inline)) +# else +# define LZ4_FORCE_INLINE static inline +# endif +# else +# define LZ4_FORCE_INLINE static +# endif /* __STDC_VERSION__ */ +# endif /* _MSC_VER */ +#endif /* LZ4_FORCE_INLINE */ + +/* LZ4_FORCE_O2_GCC_PPC64LE and LZ4_FORCE_O2_INLINE_GCC_PPC64LE + * gcc on ppc64le generates an unrolled SIMDized loop for LZ4_wildCopy8, + * together with a simple 8-byte copy loop as a fall-back path. + * However, this optimization hurts the decompression speed by >30%, + * because the execution does not go to the optimized loop + * for typical compressible data, and all of the preamble checks + * before going to the fall-back path become useless overhead. + * This optimization happens only with the -O3 flag, and -O2 generates + * a simple 8-byte copy loop. + * With gcc on ppc64le, all of the LZ4_decompress_* and LZ4_wildCopy8 + * functions are annotated with __attribute__((optimize("O2"))), + * and also LZ4_wildCopy8 is forcibly inlined, so that the O2 attribute + * of LZ4_wildCopy8 does not affect the compression speed. + */ +#if defined(__PPC64__) && defined(__LITTLE_ENDIAN__) && defined(__GNUC__) && !defined(__clang__) +# define LZ4_FORCE_O2_GCC_PPC64LE __attribute__((optimize("O2"))) +# define LZ4_FORCE_O2_INLINE_GCC_PPC64LE __attribute__((optimize("O2"))) LZ4_FORCE_INLINE +#else +# define LZ4_FORCE_O2_GCC_PPC64LE +# define LZ4_FORCE_O2_INLINE_GCC_PPC64LE static +#endif + +#if (defined(__GNUC__) && (__GNUC__ >= 3)) || (defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 800)) || defined(__clang__) +# define expect(expr,value) (__builtin_expect ((expr),(value)) ) +#else +# define expect(expr,value) (expr) +#endif + +#ifndef likely +#define likely(expr) expect((expr) != 0, 1) +#endif +#ifndef unlikely +#define unlikely(expr) expect((expr) != 0, 0) +#endif + + +/*-************************************ +* Memory routines +**************************************/ +#include <stdlib.h> /* malloc, calloc, free */ +#define ALLOC(s) malloc(s) +#define ALLOC_AND_ZERO(s) calloc(1,s) +#define FREEMEM(p) free(p) +#include <string.h> /* memset, memcpy */ +#define MEM_INIT(p,v,s) memset((p),(v),(s)) + + +/*-************************************ +* Types +**************************************/ +#if defined(__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) +# include <stdint.h> + typedef uint8_t BYTE; + typedef uint16_t U16; + typedef uint32_t U32; + typedef int32_t S32; + typedef uint64_t U64; + typedef uintptr_t uptrval; +#else + typedef unsigned char BYTE; + typedef unsigned short U16; + typedef unsigned int U32; + typedef signed int S32; + typedef unsigned long long U64; + typedef size_t uptrval; /* generally true, except OpenVMS-64 */ +#endif + +#if defined(__x86_64__) + typedef U64 reg_t; /* 64-bits in x32 mode */ +#else + typedef size_t reg_t; /* 32-bits in x32 mode */ +#endif + +namespace tracy +{ + +typedef enum { + notLimited = 0, + limitedOutput = 1, + fillOutput = 2 +} limitedOutput_directive; + + +/*-************************************ +* Reading and writing into memory +**************************************/ +static unsigned LZ4_isLittleEndian(void) +{ + const union { U32 u; BYTE c[4]; } one = { 1 }; /* don't use static : performance detrimental */ + return one.c[0]; +} + + +#if defined(LZ4_FORCE_MEMORY_ACCESS) && (LZ4_FORCE_MEMORY_ACCESS==2) +/* lie to the compiler about data alignment; use with caution */ + +static U16 LZ4_read16(const void* memPtr) { 
return *(const U16*) memPtr; } +static U32 LZ4_read32(const void* memPtr) { return *(const U32*) memPtr; } +static reg_t LZ4_read_ARCH(const void* memPtr) { return *(const reg_t*) memPtr; } + +static void LZ4_write16(void* memPtr, U16 value) { *(U16*)memPtr = value; } +static void LZ4_write32(void* memPtr, U32 value) { *(U32*)memPtr = value; } + +#elif defined(LZ4_FORCE_MEMORY_ACCESS) && (LZ4_FORCE_MEMORY_ACCESS==1) + +/* __pack instructions are safer, but compiler specific, hence potentially problematic for some compilers */ +/* currently only defined for gcc and icc */ +typedef union { U16 u16; U32 u32; reg_t uArch; } __attribute__((packed)) unalign; + +static U16 LZ4_read16(const void* ptr) { return ((const unalign*)ptr)->u16; } +static U32 LZ4_read32(const void* ptr) { return ((const unalign*)ptr)->u32; } +static reg_t LZ4_read_ARCH(const void* ptr) { return ((const unalign*)ptr)->uArch; } + +static void LZ4_write16(void* memPtr, U16 value) { ((unalign*)memPtr)->u16 = value; } +static void LZ4_write32(void* memPtr, U32 value) { ((unalign*)memPtr)->u32 = value; } + +#else /* safe and portable access using memcpy() */ + +static U16 LZ4_read16(const void* memPtr) +{ + U16 val; memcpy(&val, memPtr, sizeof(val)); return val; +} + +static U32 LZ4_read32(const void* memPtr) +{ + U32 val; memcpy(&val, memPtr, sizeof(val)); return val; +} + +static reg_t LZ4_read_ARCH(const void* memPtr) +{ + reg_t val; memcpy(&val, memPtr, sizeof(val)); return val; +} + +static void LZ4_write16(void* memPtr, U16 value) +{ + memcpy(memPtr, &value, sizeof(value)); +} + +static void LZ4_write32(void* memPtr, U32 value) +{ + memcpy(memPtr, &value, sizeof(value)); +} + +#endif /* LZ4_FORCE_MEMORY_ACCESS */ + + +static U16 LZ4_readLE16(const void* memPtr) +{ + if (LZ4_isLittleEndian()) { + return LZ4_read16(memPtr); + } else { + const BYTE* p = (const BYTE*)memPtr; + return (U16)((U16)p[0] + (p[1]<<8)); + } +} + +static void LZ4_writeLE16(void* memPtr, U16 value) +{ + if (LZ4_isLittleEndian()) { + LZ4_write16(memPtr, value); + } else { + BYTE* p = (BYTE*)memPtr; + p[0] = (BYTE) value; + p[1] = (BYTE)(value>>8); + } +} + +/* customized variant of memcpy, which can overwrite up to 8 bytes beyond dstEnd */ +LZ4_FORCE_O2_INLINE_GCC_PPC64LE +void LZ4_wildCopy8(void* dstPtr, const void* srcPtr, void* dstEnd) +{ + BYTE* d = (BYTE*)dstPtr; + const BYTE* s = (const BYTE*)srcPtr; + BYTE* const e = (BYTE*)dstEnd; + + do { memcpy(d,s,8); d+=8; s+=8; } while (d<e); +} + +static const unsigned inc32table[8] = {0, 1, 2, 1, 0, 4, 4, 4}; +static const int dec64table[8] = {0, 0, 0, -1, -4, 1, 2, 3}; + + +#ifndef LZ4_FAST_DEC_LOOP +# if defined __i386 || defined _M_IX86 || defined __x86_64__ || defined _M_X64 +# define LZ4_FAST_DEC_LOOP 1 +# else +# define LZ4_FAST_DEC_LOOP 0 +# endif +#endif + +#if LZ4_FAST_DEC_LOOP + +LZ4_FORCE_O2_INLINE_GCC_PPC64LE void +LZ4_memcpy_using_offset_base(BYTE* dstPtr, const BYTE* srcPtr, BYTE* dstEnd, const size_t offset) +{ + if (offset < 8) { + dstPtr[0] = srcPtr[0]; + dstPtr[1] = srcPtr[1]; + dstPtr[2] = srcPtr[2]; + dstPtr[3] = srcPtr[3]; + srcPtr += inc32table[offset]; + memcpy(dstPtr+4, srcPtr, 4); + srcPtr -= dec64table[offset]; + dstPtr += 8; + } else { + memcpy(dstPtr, srcPtr, 8); + dstPtr += 8; + srcPtr += 8; + } + + LZ4_wildCopy8(dstPtr, srcPtr, dstEnd); +} + +/* customized variant of memcpy, which can overwrite up to 32 bytes beyond dstEnd + * this version copies two times 16 bytes (instead of one time 32 bytes) + * because it must be compatible with offsets >= 16. 
*/ +LZ4_FORCE_O2_INLINE_GCC_PPC64LE void +LZ4_wildCopy32(void* dstPtr, const void* srcPtr, void* dstEnd) +{ + BYTE* d = (BYTE*)dstPtr; + const BYTE* s = (const BYTE*)srcPtr; + BYTE* const e = (BYTE*)dstEnd; + + do { memcpy(d,s,16); memcpy(d+16,s+16,16); d+=32; s+=32; } while (d<e); +} + +LZ4_FORCE_O2_INLINE_GCC_PPC64LE void +LZ4_memcpy_using_offset(BYTE* dstPtr, const BYTE* srcPtr, BYTE* dstEnd, const size_t offset) +{ + BYTE v[8]; + switch(offset) { + case 1: + memset(v, *srcPtr, 8); + goto copy_loop; + case 2: + memcpy(v, srcPtr, 2); + memcpy(&v[2], srcPtr, 2); + memcpy(&v[4], &v[0], 4); + goto copy_loop; + case 4: + memcpy(v, srcPtr, 4); + memcpy(&v[4], srcPtr, 4); + goto copy_loop; + default: + LZ4_memcpy_using_offset_base(dstPtr, srcPtr, dstEnd, offset); + return; + } + + copy_loop: + memcpy(dstPtr, v, 8); + dstPtr += 8; + while (dstPtr < dstEnd) { + memcpy(dstPtr, v, 8); + dstPtr += 8; + } +} +#endif + + +/*-************************************ +* Common Constants +**************************************/ +#define MINMATCH 4 + +#define WILDCOPYLENGTH 8 +#define LASTLITERALS 5 /* see ../doc/lz4_Block_format.md#parsing-restrictions */ +#define MFLIMIT 12 /* see ../doc/lz4_Block_format.md#parsing-restrictions */ +#define MATCH_SAFEGUARD_DISTANCE ((2*WILDCOPYLENGTH) - MINMATCH) /* ensure it's possible to write 2 x wildcopyLength without overflowing output buffer */ +#define FASTLOOP_SAFE_DISTANCE 64 +static const int LZ4_minLength = (MFLIMIT+1); + +#define KB *(1 <<10) +#define MB *(1 <<20) +#define GB *(1U<<30) + +#ifndef LZ4_DISTANCE_MAX /* can be user - defined at compile time */ +# define LZ4_DISTANCE_MAX 65535 +#endif + +#if (LZ4_DISTANCE_MAX > 65535) /* max supported by LZ4 format */ +# error "LZ4_DISTANCE_MAX is too big : must be <= 65535" +#endif + +#define ML_BITS 4 +#define ML_MASK ((1U<<ML_BITS)-1) +#define RUN_BITS (8-ML_BITS) +#define RUN_MASK ((1U<<RUN_BITS)-1) + + +/*-************************************ +* Error detection +**************************************/ +#if defined(LZ4_DEBUG) && (LZ4_DEBUG>=1) +# include <assert.h> +#else +# ifndef assert +# define assert(condition) ((void)0) +# endif +#endif + +#define LZ4_STATIC_ASSERT(c) { enum { LZ4_static_assert = 1/(int)(!!(c)) }; } /* use after variable declarations */ + +#if defined(LZ4_DEBUG) && (LZ4_DEBUG>=2) +# include <stdio.h> +static int g_debuglog_enable = 1; +# define DEBUGLOG(l, ...) { \ + if ((g_debuglog_enable) && (l<=LZ4_DEBUG)) { \ + fprintf(stderr, __FILE__ ": "); \ + fprintf(stderr, __VA_ARGS__); \ + fprintf(stderr, " \n"); \ + } } +#else +# define DEBUGLOG(l, ...) 
{} /* disabled */ +#endif + + +/*-************************************ +* Common functions +**************************************/ +static unsigned LZ4_NbCommonBytes (reg_t val) +{ + if (LZ4_isLittleEndian()) { + if (sizeof(val)==8) { +# if defined(_MSC_VER) && defined(_WIN64) && !defined(LZ4_FORCE_SW_BITCOUNT) + unsigned long r = 0; + _BitScanForward64( &r, (U64)val ); + return (int)(r>>3); +# elif (defined(__clang__) || (defined(__GNUC__) && (__GNUC__>=3))) && !defined(LZ4_FORCE_SW_BITCOUNT) + return (__builtin_ctzll((U64)val) >> 3); +# else + static const int DeBruijnBytePos[64] = { 0, 0, 0, 0, 0, 1, 1, 2, + 0, 3, 1, 3, 1, 4, 2, 7, + 0, 2, 3, 6, 1, 5, 3, 5, + 1, 3, 4, 4, 2, 5, 6, 7, + 7, 0, 1, 2, 3, 3, 4, 6, + 2, 6, 5, 5, 3, 4, 5, 6, + 7, 1, 2, 4, 6, 4, 4, 5, + 7, 2, 6, 5, 7, 6, 7, 7 }; + return DeBruijnBytePos[((U64)((val & -(long long)val) * 0x0218A392CDABBD3FULL)) >> 58]; +# endif + } else /* 32 bits */ { +# if defined(_MSC_VER) && !defined(LZ4_FORCE_SW_BITCOUNT) + unsigned long r; + _BitScanForward( &r, (U32)val ); + return (int)(r>>3); +# elif (defined(__clang__) || (defined(__GNUC__) && (__GNUC__>=3))) && !defined(LZ4_FORCE_SW_BITCOUNT) + return (__builtin_ctz((U32)val) >> 3); +# else + static const int DeBruijnBytePos[32] = { 0, 0, 3, 0, 3, 1, 3, 0, + 3, 2, 2, 1, 3, 2, 0, 1, + 3, 3, 1, 2, 2, 2, 2, 0, + 3, 1, 2, 0, 1, 0, 1, 1 }; + return DeBruijnBytePos[((U32)((val & -(S32)val) * 0x077CB531U)) >> 27]; +# endif + } + } else /* Big Endian CPU */ { + if (sizeof(val)==8) { /* 64-bits */ +# if defined(_MSC_VER) && defined(_WIN64) && !defined(LZ4_FORCE_SW_BITCOUNT) + unsigned long r = 0; + _BitScanReverse64( &r, val ); + return (unsigned)(r>>3); +# elif (defined(__clang__) || (defined(__GNUC__) && (__GNUC__>=3))) && !defined(LZ4_FORCE_SW_BITCOUNT) + return (__builtin_clzll((U64)val) >> 3); +# else + static const U32 by32 = sizeof(val)*4; /* 32 on 64 bits (goal), 16 on 32 bits. + Just to avoid some static analyzer complaining about shift by 32 on 32-bits target. + Note that this code path is never triggered in 32-bits mode. 
*/ + unsigned r; + if (!(val>>by32)) { r=4; } else { r=0; val>>=by32; } + if (!(val>>16)) { r+=2; val>>=8; } else { val>>=24; } + r += (!val); + return r; +# endif + } else /* 32 bits */ { +# if defined(_MSC_VER) && !defined(LZ4_FORCE_SW_BITCOUNT) + unsigned long r = 0; + _BitScanReverse( &r, (unsigned long)val ); + return (unsigned)(r>>3); +# elif (defined(__clang__) || (defined(__GNUC__) && (__GNUC__>=3))) && !defined(LZ4_FORCE_SW_BITCOUNT) + return (__builtin_clz((U32)val) >> 3); +# else + unsigned r; + if (!(val>>16)) { r=2; val>>=8; } else { r=0; val>>=24; } + r += (!val); + return r; +# endif + } + } +} + +#define STEPSIZE sizeof(reg_t) +LZ4_FORCE_INLINE +unsigned LZ4_count(const BYTE* pIn, const BYTE* pMatch, const BYTE* pInLimit) +{ + const BYTE* const pStart = pIn; + + if (likely(pIn < pInLimit-(STEPSIZE-1))) { + reg_t const diff = LZ4_read_ARCH(pMatch) ^ LZ4_read_ARCH(pIn); + if (!diff) { + pIn+=STEPSIZE; pMatch+=STEPSIZE; + } else { + return LZ4_NbCommonBytes(diff); + } } + + while (likely(pIn < pInLimit-(STEPSIZE-1))) { + reg_t const diff = LZ4_read_ARCH(pMatch) ^ LZ4_read_ARCH(pIn); + if (!diff) { pIn+=STEPSIZE; pMatch+=STEPSIZE; continue; } + pIn += LZ4_NbCommonBytes(diff); + return (unsigned)(pIn - pStart); + } + + if ((STEPSIZE==8) && (pIn<(pInLimit-3)) && (LZ4_read32(pMatch) == LZ4_read32(pIn))) { pIn+=4; pMatch+=4; } + if ((pIn<(pInLimit-1)) && (LZ4_read16(pMatch) == LZ4_read16(pIn))) { pIn+=2; pMatch+=2; } + if ((pIn<pInLimit) && (*pMatch == *pIn)) pIn++; + return (unsigned)(pIn - pStart); +} + + +#ifndef LZ4_COMMONDEFS_ONLY +/*-************************************ +* Local Constants +**************************************/ +static const int LZ4_64Klimit = ((64 KB) + (MFLIMIT-1)); +static const U32 LZ4_skipTrigger = 6; /* Increase this value ==> compression run slower on incompressible data */ + + +/*-************************************ +* Local Structures and types +**************************************/ +typedef enum { clearedTable = 0, byPtr, byU32, byU16 } tableType_t; + +/** + * This enum distinguishes several different modes of accessing previous + * content in the stream. + * + * - noDict : There is no preceding content. + * - withPrefix64k : Table entries up to ctx->dictSize before the current blob + * blob being compressed are valid and refer to the preceding + * content (of length ctx->dictSize), which is available + * contiguously preceding in memory the content currently + * being compressed. + * - usingExtDict : Like withPrefix64k, but the preceding content is somewhere + * else in memory, starting at ctx->dictionary with length + * ctx->dictSize. + * - usingDictCtx : Like usingExtDict, but everything concerning the preceding + * content is in a separate context, pointed to by + * ctx->dictCtx. ctx->dictionary, ctx->dictSize, and table + * entries in the current context that refer to positions + * preceding the beginning of the current compression are + * ignored. Instead, ctx->dictCtx->dictionary and ctx->dictCtx + * ->dictSize describe the location and size of the preceding + * content, and matches are found by looking in the ctx + * ->dictCtx->hashTable. 
+ */ +typedef enum { noDict = 0, withPrefix64k, usingExtDict, usingDictCtx } dict_directive; +typedef enum { noDictIssue = 0, dictSmall } dictIssue_directive; + + +/*-************************************ +* Local Utils +**************************************/ +int LZ4_versionNumber (void) { return LZ4_VERSION_NUMBER; } +const char* LZ4_versionString(void) { return LZ4_VERSION_STRING; } +int LZ4_compressBound(int isize) { return LZ4_COMPRESSBOUND(isize); } +int LZ4_sizeofState() { return LZ4_STREAMSIZE; } + + +/*-************************************ +* Internal Definitions used in Tests +**************************************/ + +int LZ4_compress_forceExtDict (LZ4_stream_t* LZ4_stream, const char* source, char* dest, int inputSize); + +int LZ4_decompress_safe_forceExtDict(const char* in, char* out, int inSize, int outSize, const void* dict, size_t dictSize); + +/*-****************************** +* Compression functions +********************************/ +static U32 LZ4_hash4(U32 sequence, tableType_t const tableType) +{ + if (tableType == byU16) + return ((sequence * 2654435761U) >> ((MINMATCH*8)-(LZ4_HASHLOG+1))); + else + return ((sequence * 2654435761U) >> ((MINMATCH*8)-LZ4_HASHLOG)); +} + +static U32 LZ4_hash5(U64 sequence, tableType_t const tableType) +{ + const U32 hashLog = (tableType == byU16) ? LZ4_HASHLOG+1 : LZ4_HASHLOG; + if (LZ4_isLittleEndian()) { + const U64 prime5bytes = 889523592379ULL; + return (U32)(((sequence << 24) * prime5bytes) >> (64 - hashLog)); + } else { + const U64 prime8bytes = 11400714785074694791ULL; + return (U32)(((sequence >> 24) * prime8bytes) >> (64 - hashLog)); + } +} + +LZ4_FORCE_INLINE U32 LZ4_hashPosition(const void* const p, tableType_t const tableType) +{ + if ((sizeof(reg_t)==8) && (tableType != byU16)) return LZ4_hash5(LZ4_read_ARCH(p), tableType); + return LZ4_hash4(LZ4_read32(p), tableType); +} + +static void LZ4_putIndexOnHash(U32 idx, U32 h, void* tableBase, tableType_t const tableType) +{ + switch (tableType) + { + default: /* fallthrough */ + case clearedTable: /* fallthrough */ + case byPtr: { /* illegal! */ assert(0); return; } + case byU32: { U32* hashTable = (U32*) tableBase; hashTable[h] = idx; return; } + case byU16: { U16* hashTable = (U16*) tableBase; assert(idx < 65536); hashTable[h] = (U16)idx; return; } + } +} + +static void LZ4_putPositionOnHash(const BYTE* p, U32 h, + void* tableBase, tableType_t const tableType, + const BYTE* srcBase) +{ + switch (tableType) + { + case clearedTable: { /* illegal! */ assert(0); return; } + case byPtr: { const BYTE** hashTable = (const BYTE**)tableBase; hashTable[h] = p; return; } + case byU32: { U32* hashTable = (U32*) tableBase; hashTable[h] = (U32)(p-srcBase); return; } + case byU16: { U16* hashTable = (U16*) tableBase; hashTable[h] = (U16)(p-srcBase); return; } + } +} + +LZ4_FORCE_INLINE void LZ4_putPosition(const BYTE* p, void* tableBase, tableType_t tableType, const BYTE* srcBase) +{ + U32 const h = LZ4_hashPosition(p, tableType); + LZ4_putPositionOnHash(p, h, tableBase, tableType, srcBase); +} + +/* LZ4_getIndexOnHash() : + * Index of match position registered in hash table. + * hash position must be calculated by using base+index, or dictBase+index. + * Assumption 1 : only valid if tableType == byU32 or byU16. 
+ * Assumption 2 : h is presumed valid (within limits of hash table) + */ +static U32 LZ4_getIndexOnHash(U32 h, const void* tableBase, tableType_t tableType) +{ + LZ4_STATIC_ASSERT(LZ4_MEMORY_USAGE > 2); + if (tableType == byU32) { + const U32* const hashTable = (const U32*) tableBase; + assert(h < (1U << (LZ4_MEMORY_USAGE-2))); + return hashTable[h]; + } + if (tableType == byU16) { + const U16* const hashTable = (const U16*) tableBase; + assert(h < (1U << (LZ4_MEMORY_USAGE-1))); + return hashTable[h]; + } + assert(0); return 0; /* forbidden case */ +} + +static const BYTE* LZ4_getPositionOnHash(U32 h, const void* tableBase, tableType_t tableType, const BYTE* srcBase) +{ + if (tableType == byPtr) { const BYTE* const* hashTable = (const BYTE* const*) tableBase; return hashTable[h]; } + if (tableType == byU32) { const U32* const hashTable = (const U32*) tableBase; return hashTable[h] + srcBase; } + { const U16* const hashTable = (const U16*) tableBase; return hashTable[h] + srcBase; } /* default, to ensure a return */ +} + +LZ4_FORCE_INLINE const BYTE* LZ4_getPosition(const BYTE* p, + const void* tableBase, tableType_t tableType, + const BYTE* srcBase) +{ + U32 const h = LZ4_hashPosition(p, tableType); + return LZ4_getPositionOnHash(h, tableBase, tableType, srcBase); +} + +LZ4_FORCE_INLINE void LZ4_prepareTable( + LZ4_stream_t_internal* const cctx, + const int inputSize, + const tableType_t tableType) { + /* If compression failed during the previous step, then the context + * is marked as dirty, therefore, it has to be fully reset. + */ + if (cctx->dirty) { + DEBUGLOG(5, "LZ4_prepareTable: Full reset for %p", cctx); + MEM_INIT(cctx, 0, sizeof(LZ4_stream_t_internal)); + return; + } + + /* If the table hasn't been used, it's guaranteed to be zeroed out, and is + * therefore safe to use no matter what mode we're in. Otherwise, we figure + * out if it's safe to leave as is or whether it needs to be reset. + */ + if (cctx->tableType != clearedTable) { + if (cctx->tableType != tableType + || (tableType == byU16 && cctx->currentOffset + inputSize >= 0xFFFFU) + || (tableType == byU32 && cctx->currentOffset > 1 GB) + || tableType == byPtr + || inputSize >= 4 KB) + { + DEBUGLOG(4, "LZ4_prepareTable: Resetting table in %p", cctx); + MEM_INIT(cctx->hashTable, 0, LZ4_HASHTABLESIZE); + cctx->currentOffset = 0; + cctx->tableType = clearedTable; + } else { + DEBUGLOG(4, "LZ4_prepareTable: Re-use hash table (no reset)"); + } + } + + /* Adding a gap, so all previous entries are > LZ4_DISTANCE_MAX back, is faster + * than compressing without a gap. However, compressing with + * currentOffset == 0 is faster still, so we preserve that case. 
+ */ + if (cctx->currentOffset != 0 && tableType == byU32) { + DEBUGLOG(5, "LZ4_prepareTable: adding 64KB to currentOffset"); + cctx->currentOffset += 64 KB; + } + + /* Finally, clear history */ + cctx->dictCtx = NULL; + cctx->dictionary = NULL; + cctx->dictSize = 0; +} + +/** LZ4_compress_generic() : + inlined, to ensure branches are decided at compilation time */ +LZ4_FORCE_INLINE int LZ4_compress_generic( + LZ4_stream_t_internal* const cctx, + const char* const source, + char* const dest, + const int inputSize, + int *inputConsumed, /* only written when outputDirective == fillOutput */ + const int maxOutputSize, + const limitedOutput_directive outputDirective, + const tableType_t tableType, + const dict_directive dictDirective, + const dictIssue_directive dictIssue, + const int acceleration) +{ + int result; + const BYTE* ip = (const BYTE*) source; + + U32 const startIndex = cctx->currentOffset; + const BYTE* base = (const BYTE*) source - startIndex; + const BYTE* lowLimit; + + const LZ4_stream_t_internal* dictCtx = (const LZ4_stream_t_internal*) cctx->dictCtx; + const BYTE* const dictionary = + dictDirective == usingDictCtx ? dictCtx->dictionary : cctx->dictionary; + const U32 dictSize = + dictDirective == usingDictCtx ? dictCtx->dictSize : cctx->dictSize; + const U32 dictDelta = (dictDirective == usingDictCtx) ? startIndex - dictCtx->currentOffset : 0; /* make indexes in dictCtx comparable with index in current context */ + + int const maybe_extMem = (dictDirective == usingExtDict) || (dictDirective == usingDictCtx); + U32 const prefixIdxLimit = startIndex - dictSize; /* used when dictDirective == dictSmall */ + const BYTE* const dictEnd = dictionary + dictSize; + const BYTE* anchor = (const BYTE*) source; + const BYTE* const iend = ip + inputSize; + const BYTE* const mflimitPlusOne = iend - MFLIMIT + 1; + const BYTE* const matchlimit = iend - LASTLITERALS; + + /* the dictCtx currentOffset is indexed on the start of the dictionary, + * while a dictionary in the current context precedes the currentOffset */ + const BYTE* dictBase = (dictDirective == usingDictCtx) ? + dictionary + dictSize - dictCtx->currentOffset : + dictionary + dictSize - startIndex; + + BYTE* op = (BYTE*) dest; + BYTE* const olimit = op + maxOutputSize; + + U32 offset = 0; + U32 forwardH; + + DEBUGLOG(5, "LZ4_compress_generic: srcSize=%i, tableType=%u", inputSize, tableType); + /* If init conditions are not met, we don't have to mark stream + * as having dirty context, since no action was taken yet */ + if (outputDirective == fillOutput && maxOutputSize < 1) return 0; /* Impossible to store anything */ + if ((U32)inputSize > (U32)LZ4_MAX_INPUT_SIZE) return 0; /* Unsupported inputSize, too large (or negative) */ + if ((tableType == byU16) && (inputSize>=LZ4_64Klimit)) return 0; /* Size too large (not within 64K limit) */ + if (tableType==byPtr) assert(dictDirective==noDict); /* only supported use case with byPtr */ + assert(acceleration >= 1); + + lowLimit = (const BYTE*)source - (dictDirective == withPrefix64k ? dictSize : 0); + + /* Update context state */ + if (dictDirective == usingDictCtx) { + /* Subsequent linked blocks can't use the dictionary. */ + /* Instead, they use the block we just compressed. 
*/ + cctx->dictCtx = NULL; + cctx->dictSize = (U32)inputSize; + } else { + cctx->dictSize += (U32)inputSize; + } + cctx->currentOffset += (U32)inputSize; + cctx->tableType = (U16)tableType; + + if (inputSize<LZ4_minLength) goto _last_literals; /* Input too small, no compression (all literals) */ + + /* First Byte */ + LZ4_putPosition(ip, cctx->hashTable, tableType, base); + ip++; forwardH = LZ4_hashPosition(ip, tableType); + + /* Main Loop */ + for ( ; ; ) { + const BYTE* match; + BYTE* token; + + /* Find a match */ + if (tableType == byPtr) { + const BYTE* forwardIp = ip; + int step = 1; + int searchMatchNb = acceleration << LZ4_skipTrigger; + do { + U32 const h = forwardH; + ip = forwardIp; + forwardIp += step; + step = (searchMatchNb++ >> LZ4_skipTrigger); + + if (unlikely(forwardIp > mflimitPlusOne)) goto _last_literals; + assert(ip < mflimitPlusOne); + + match = LZ4_getPositionOnHash(h, cctx->hashTable, tableType, base); + forwardH = LZ4_hashPosition(forwardIp, tableType); + LZ4_putPositionOnHash(ip, h, cctx->hashTable, tableType, base); + + } while ( (match+LZ4_DISTANCE_MAX < ip) + || (LZ4_read32(match) != LZ4_read32(ip)) ); + + } else { /* byU32, byU16 */ + + const BYTE* forwardIp = ip; + int step = 1; + int searchMatchNb = acceleration << LZ4_skipTrigger; + do { + U32 const h = forwardH; + U32 const current = (U32)(forwardIp - base); + U32 matchIndex = LZ4_getIndexOnHash(h, cctx->hashTable, tableType); + assert(matchIndex <= current); + assert(forwardIp - base < (ptrdiff_t)(2 GB - 1)); + ip = forwardIp; + forwardIp += step; + step = (searchMatchNb++ >> LZ4_skipTrigger); + + if (unlikely(forwardIp > mflimitPlusOne)) goto _last_literals; + assert(ip < mflimitPlusOne); + + if (dictDirective == usingDictCtx) { + if (matchIndex < startIndex) { + /* there was no match, try the dictionary */ + assert(tableType == byU32); + matchIndex = LZ4_getIndexOnHash(h, dictCtx->hashTable, byU32); + match = dictBase + matchIndex; + matchIndex += dictDelta; /* make dictCtx index comparable with current context */ + lowLimit = dictionary; + } else { + match = base + matchIndex; + lowLimit = (const BYTE*)source; + } + } else if (dictDirective==usingExtDict) { + if (matchIndex < startIndex) { + DEBUGLOG(7, "extDict candidate: matchIndex=%5u < startIndex=%5u", matchIndex, startIndex); + assert(startIndex - matchIndex >= MINMATCH); + match = dictBase + matchIndex; + lowLimit = dictionary; + } else { + match = base + matchIndex; + lowLimit = (const BYTE*)source; + } + } else { /* single continuous memory segment */ + match = base + matchIndex; + } + forwardH = LZ4_hashPosition(forwardIp, tableType); + LZ4_putIndexOnHash(current, h, cctx->hashTable, tableType); + + if ((dictIssue == dictSmall) && (matchIndex < prefixIdxLimit)) continue; /* match outside of valid area */ + assert(matchIndex < current); + if ((tableType != byU16) && (matchIndex+LZ4_DISTANCE_MAX < current)) continue; /* too far */ + if (tableType == byU16) assert((current - matchIndex) <= LZ4_DISTANCE_MAX); /* too_far presumed impossible with byU16 */ + + if (LZ4_read32(match) == LZ4_read32(ip)) { + if (maybe_extMem) offset = current - matchIndex; + break; /* match found */ + } + + } while(1); + } + + /* Catch up */ + while (((ip>anchor) & (match > lowLimit)) && (unlikely(ip[-1]==match[-1]))) { ip--; match--; } + + /* Encode Literals */ + { unsigned const litLength = (unsigned)(ip - anchor); + token = op++; + if ((outputDirective == limitedOutput) && /* Check output buffer overflow */ + (unlikely(op + litLength + (2 + 1 + LASTLITERALS) + 
(litLength/255) > olimit)) ) + return 0; /* cannot compress within `dst` budget. Stored indexes in hash table are nonetheless fine */ + + if ((outputDirective == fillOutput) && + (unlikely(op + (litLength+240)/255 /* litlen */ + litLength /* literals */ + 2 /* offset */ + 1 /* token */ + MFLIMIT - MINMATCH /* min last literals so last match is <= end - MFLIMIT */ > olimit))) { + op--; + goto _last_literals; + } + if (litLength >= RUN_MASK) { + int len = (int)(litLength - RUN_MASK); + *token = (RUN_MASK<<ML_BITS); + for(; len >= 255 ; len-=255) *op++ = 255; + *op++ = (BYTE)len; + } + else *token = (BYTE)(litLength<<ML_BITS); + + /* Copy Literals */ + LZ4_wildCopy8(op, anchor, op+litLength); + op+=litLength; + DEBUGLOG(6, "seq.start:%i, literals=%u, match.start:%i", + (int)(anchor-(const BYTE*)source), litLength, (int)(ip-(const BYTE*)source)); + } + +_next_match: + /* at this stage, the following variables must be correctly set : + * - ip : at start of LZ operation + * - match : at start of previous pattern occurence; can be within current prefix, or within extDict + * - offset : if maybe_ext_memSegment==1 (constant) + * - lowLimit : must be == dictionary to mean "match is within extDict"; must be == source otherwise + * - token and *token : position to write 4-bits for match length; higher 4-bits for literal length supposed already written + */ + + if ((outputDirective == fillOutput) && + (op + 2 /* offset */ + 1 /* token */ + MFLIMIT - MINMATCH /* min last literals so last match is <= end - MFLIMIT */ > olimit)) { + /* the match was too close to the end, rewind and go to last literals */ + op = token; + goto _last_literals; + } + + /* Encode Offset */ + if (maybe_extMem) { /* static test */ + DEBUGLOG(6, " with offset=%u (ext if > %i)", offset, (int)(ip - (const BYTE*)source)); + assert(offset <= LZ4_DISTANCE_MAX && offset > 0); + LZ4_writeLE16(op, (U16)offset); op+=2; + } else { + DEBUGLOG(6, " with offset=%u (same segment)", (U32)(ip - match)); + assert(ip-match <= LZ4_DISTANCE_MAX); + LZ4_writeLE16(op, (U16)(ip - match)); op+=2; + } + + /* Encode MatchLength */ + { unsigned matchCode; + + if ( (dictDirective==usingExtDict || dictDirective==usingDictCtx) + && (lowLimit==dictionary) /* match within extDict */ ) { + const BYTE* limit = ip + (dictEnd-match); + assert(dictEnd > match); + if (limit > matchlimit) limit = matchlimit; + matchCode = LZ4_count(ip+MINMATCH, match+MINMATCH, limit); + ip += (size_t)matchCode + MINMATCH; + if (ip==limit) { + unsigned const more = LZ4_count(limit, (const BYTE*)source, matchlimit); + matchCode += more; + ip += more; + } + DEBUGLOG(6, " with matchLength=%u starting in extDict", matchCode+MINMATCH); + } else { + matchCode = LZ4_count(ip+MINMATCH, match+MINMATCH, matchlimit); + ip += (size_t)matchCode + MINMATCH; + DEBUGLOG(6, " with matchLength=%u", matchCode+MINMATCH); + } + + if ((outputDirective) && /* Check output buffer overflow */ + (unlikely(op + (1 + LASTLITERALS) + (matchCode>>8) > olimit)) ) { + if (outputDirective == fillOutput) { + /* Match description too long : reduce it */ + U32 newMatchCode = 15 /* in token */ - 1 /* to avoid needing a zero byte */ + ((U32)(olimit - op) - 2 - 1 - LASTLITERALS) * 255; + ip -= matchCode - newMatchCode; + matchCode = newMatchCode; + } else { + assert(outputDirective == limitedOutput); + return 0; /* cannot compress within `dst` budget. 
Stored indexes in hash table are nonetheless fine */ + } + } + if (matchCode >= ML_MASK) { + *token += ML_MASK; + matchCode -= ML_MASK; + LZ4_write32(op, 0xFFFFFFFF); + while (matchCode >= 4*255) { + op+=4; + LZ4_write32(op, 0xFFFFFFFF); + matchCode -= 4*255; + } + op += matchCode / 255; + *op++ = (BYTE)(matchCode % 255); + } else + *token += (BYTE)(matchCode); + } + + anchor = ip; + + /* Test end of chunk */ + if (ip >= mflimitPlusOne) break; + + /* Fill table */ + LZ4_putPosition(ip-2, cctx->hashTable, tableType, base); + + /* Test next position */ + if (tableType == byPtr) { + + match = LZ4_getPosition(ip, cctx->hashTable, tableType, base); + LZ4_putPosition(ip, cctx->hashTable, tableType, base); + if ( (match+LZ4_DISTANCE_MAX >= ip) + && (LZ4_read32(match) == LZ4_read32(ip)) ) + { token=op++; *token=0; goto _next_match; } + + } else { /* byU32, byU16 */ + + U32 const h = LZ4_hashPosition(ip, tableType); + U32 const current = (U32)(ip-base); + U32 matchIndex = LZ4_getIndexOnHash(h, cctx->hashTable, tableType); + assert(matchIndex < current); + if (dictDirective == usingDictCtx) { + if (matchIndex < startIndex) { + /* there was no match, try the dictionary */ + matchIndex = LZ4_getIndexOnHash(h, dictCtx->hashTable, byU32); + match = dictBase + matchIndex; + lowLimit = dictionary; /* required for match length counter */ + matchIndex += dictDelta; + } else { + match = base + matchIndex; + lowLimit = (const BYTE*)source; /* required for match length counter */ + } + } else if (dictDirective==usingExtDict) { + if (matchIndex < startIndex) { + match = dictBase + matchIndex; + lowLimit = dictionary; /* required for match length counter */ + } else { + match = base + matchIndex; + lowLimit = (const BYTE*)source; /* required for match length counter */ + } + } else { /* single memory segment */ + match = base + matchIndex; + } + LZ4_putIndexOnHash(current, h, cctx->hashTable, tableType); + assert(matchIndex < current); + if ( ((dictIssue==dictSmall) ? (matchIndex >= prefixIdxLimit) : 1) + && ((tableType==byU16) ? 1 : (matchIndex+LZ4_DISTANCE_MAX >= current)) + && (LZ4_read32(match) == LZ4_read32(ip)) ) { + token=op++; + *token=0; + if (maybe_extMem) offset = current - matchIndex; + DEBUGLOG(6, "seq.start:%i, literals=%u, match.start:%i", + (int)(anchor-(const BYTE*)source), 0, (int)(ip-(const BYTE*)source)); + goto _next_match; + } + } + + /* Prepare next loop */ + forwardH = LZ4_hashPosition(++ip, tableType); + + } + +_last_literals: + /* Encode Last Literals */ + { size_t lastRun = (size_t)(iend - anchor); + if ( (outputDirective) && /* Check output buffer overflow */ + (op + lastRun + 1 + ((lastRun+255-RUN_MASK)/255) > olimit)) { + if (outputDirective == fillOutput) { + /* adapt lastRun to fill 'dst' */ + assert(olimit >= op); + lastRun = (size_t)(olimit-op) - 1; + lastRun -= (lastRun+240)/255; + } else { + assert(outputDirective == limitedOutput); + return 0; /* cannot compress within `dst` budget. 
Stored indexes in hash table are nonetheless fine */ + } + } + if (lastRun >= RUN_MASK) { + size_t accumulator = lastRun - RUN_MASK; + *op++ = RUN_MASK << ML_BITS; + for(; accumulator >= 255 ; accumulator-=255) *op++ = 255; + *op++ = (BYTE) accumulator; + } else { + *op++ = (BYTE)(lastRun<<ML_BITS); + } + memcpy(op, anchor, lastRun); + ip = anchor + lastRun; + op += lastRun; + } + + if (outputDirective == fillOutput) { + *inputConsumed = (int) (((const char*)ip)-source); + } + DEBUGLOG(5, "LZ4_compress_generic: compressed %i bytes into %i bytes", inputSize, (int)(((char*)op) - dest)); + result = (int)(((char*)op) - dest); + assert(result > 0); + return result; +} + + +int LZ4_compress_fast_extState(void* state, const char* source, char* dest, int inputSize, int maxOutputSize, int acceleration) +{ + LZ4_stream_t_internal* const ctx = & LZ4_initStream(state, sizeof(LZ4_stream_t)) -> internal_donotuse; + assert(ctx != NULL); + if (acceleration < 1) acceleration = ACCELERATION_DEFAULT; + if (maxOutputSize >= LZ4_compressBound(inputSize)) { + if (inputSize < LZ4_64Klimit) { + return LZ4_compress_generic(ctx, source, dest, inputSize, NULL, 0, notLimited, byU16, noDict, noDictIssue, acceleration); + } else { + const tableType_t tableType = ((sizeof(void*)==4) && ((uptrval)source > LZ4_DISTANCE_MAX)) ? byPtr : byU32; + return LZ4_compress_generic(ctx, source, dest, inputSize, NULL, 0, notLimited, tableType, noDict, noDictIssue, acceleration); + } + } else { + if (inputSize < LZ4_64Klimit) {; + return LZ4_compress_generic(ctx, source, dest, inputSize, NULL, maxOutputSize, limitedOutput, byU16, noDict, noDictIssue, acceleration); + } else { + const tableType_t tableType = ((sizeof(void*)==4) && ((uptrval)source > LZ4_DISTANCE_MAX)) ? byPtr : byU32; + return LZ4_compress_generic(ctx, source, dest, inputSize, NULL, maxOutputSize, limitedOutput, tableType, noDict, noDictIssue, acceleration); + } + } +} + +/** + * LZ4_compress_fast_extState_fastReset() : + * A variant of LZ4_compress_fast_extState(). + * + * Using this variant avoids an expensive initialization step. It is only safe + * to call if the state buffer is known to be correctly initialized already + * (see comment in lz4.h on LZ4_resetStream_fast() for a definition of + * "correctly initialized"). + */ +int LZ4_compress_fast_extState_fastReset(void* state, const char* src, char* dst, int srcSize, int dstCapacity, int acceleration) +{ + LZ4_stream_t_internal* ctx = &((LZ4_stream_t*)state)->internal_donotuse; + if (acceleration < 1) acceleration = ACCELERATION_DEFAULT; + + if (dstCapacity >= LZ4_compressBound(srcSize)) { + if (srcSize < LZ4_64Klimit) { + const tableType_t tableType = byU16; + LZ4_prepareTable(ctx, srcSize, tableType); + if (ctx->currentOffset) { + return LZ4_compress_generic(ctx, src, dst, srcSize, NULL, 0, notLimited, tableType, noDict, dictSmall, acceleration); + } else { + return LZ4_compress_generic(ctx, src, dst, srcSize, NULL, 0, notLimited, tableType, noDict, noDictIssue, acceleration); + } + } else { + const tableType_t tableType = ((sizeof(void*)==4) && ((uptrval)src > LZ4_DISTANCE_MAX)) ? 
byPtr : byU32; + LZ4_prepareTable(ctx, srcSize, tableType); + return LZ4_compress_generic(ctx, src, dst, srcSize, NULL, 0, notLimited, tableType, noDict, noDictIssue, acceleration); + } + } else { + if (srcSize < LZ4_64Klimit) { + const tableType_t tableType = byU16; + LZ4_prepareTable(ctx, srcSize, tableType); + if (ctx->currentOffset) { + return LZ4_compress_generic(ctx, src, dst, srcSize, NULL, dstCapacity, limitedOutput, tableType, noDict, dictSmall, acceleration); + } else { + return LZ4_compress_generic(ctx, src, dst, srcSize, NULL, dstCapacity, limitedOutput, tableType, noDict, noDictIssue, acceleration); + } + } else { + const tableType_t tableType = ((sizeof(void*)==4) && ((uptrval)src > LZ4_DISTANCE_MAX)) ? byPtr : byU32; + LZ4_prepareTable(ctx, srcSize, tableType); + return LZ4_compress_generic(ctx, src, dst, srcSize, NULL, dstCapacity, limitedOutput, tableType, noDict, noDictIssue, acceleration); + } + } +} + + +int LZ4_compress_fast(const char* source, char* dest, int inputSize, int maxOutputSize, int acceleration) +{ + int result; +#if (LZ4_HEAPMODE) + LZ4_stream_t* ctxPtr = ALLOC(sizeof(LZ4_stream_t)); /* malloc-calloc always properly aligned */ + if (ctxPtr == NULL) return 0; +#else + LZ4_stream_t ctx; + LZ4_stream_t* const ctxPtr = &ctx; +#endif + result = LZ4_compress_fast_extState(ctxPtr, source, dest, inputSize, maxOutputSize, acceleration); + +#if (LZ4_HEAPMODE) + FREEMEM(ctxPtr); +#endif + return result; +} + + +int LZ4_compress_default(const char* src, char* dst, int srcSize, int maxOutputSize) +{ + return LZ4_compress_fast(src, dst, srcSize, maxOutputSize, 1); +} + + +/* hidden debug function */ +/* strangely enough, gcc generates faster code when this function is uncommented, even if unused */ +int LZ4_compress_fast_force(const char* src, char* dst, int srcSize, int dstCapacity, int acceleration) +{ + LZ4_stream_t ctx; + LZ4_initStream(&ctx, sizeof(ctx)); + + if (srcSize < LZ4_64Klimit) { + return LZ4_compress_generic(&ctx.internal_donotuse, src, dst, srcSize, NULL, dstCapacity, limitedOutput, byU16, noDict, noDictIssue, acceleration); + } else { + tableType_t const addrMode = (sizeof(void*) > 4) ? byU32 : byPtr; + return LZ4_compress_generic(&ctx.internal_donotuse, src, dst, srcSize, NULL, dstCapacity, limitedOutput, addrMode, noDict, noDictIssue, acceleration); + } +} + + +/* Note!: This function leaves the stream in an unclean/broken state! + * It is not safe to subsequently use the same state with a _fastReset() or + * _continue() call without resetting it. */ +static int LZ4_compress_destSize_extState (LZ4_stream_t* state, const char* src, char* dst, int* srcSizePtr, int targetDstSize) +{ + void* const s = LZ4_initStream(state, sizeof (*state)); + assert(s != NULL); (void)s; + + if (targetDstSize >= LZ4_compressBound(*srcSizePtr)) { /* compression success is guaranteed */ + return LZ4_compress_fast_extState(state, src, dst, *srcSizePtr, targetDstSize, 1); + } else { + if (*srcSizePtr < LZ4_64Klimit) { + return LZ4_compress_generic(&state->internal_donotuse, src, dst, *srcSizePtr, srcSizePtr, targetDstSize, fillOutput, byU16, noDict, noDictIssue, 1); + } else { + tableType_t const addrMode = ((sizeof(void*)==4) && ((uptrval)src > LZ4_DISTANCE_MAX)) ? 
byPtr : byU32; + return LZ4_compress_generic(&state->internal_donotuse, src, dst, *srcSizePtr, srcSizePtr, targetDstSize, fillOutput, addrMode, noDict, noDictIssue, 1); + } } +} + + +int LZ4_compress_destSize(const char* src, char* dst, int* srcSizePtr, int targetDstSize) +{ +#if (LZ4_HEAPMODE) + LZ4_stream_t* ctx = (LZ4_stream_t*)ALLOC(sizeof(LZ4_stream_t)); /* malloc-calloc always properly aligned */ + if (ctx == NULL) return 0; +#else + LZ4_stream_t ctxBody; + LZ4_stream_t* ctx = &ctxBody; +#endif + + int result = LZ4_compress_destSize_extState(ctx, src, dst, srcSizePtr, targetDstSize); + +#if (LZ4_HEAPMODE) + FREEMEM(ctx); +#endif + return result; +} + + + +/*-****************************** +* Streaming functions +********************************/ + +LZ4_stream_t* LZ4_createStream(void) +{ + LZ4_stream_t* const lz4s = (LZ4_stream_t*)ALLOC(sizeof(LZ4_stream_t)); + LZ4_STATIC_ASSERT(LZ4_STREAMSIZE >= sizeof(LZ4_stream_t_internal)); /* A compilation error here means LZ4_STREAMSIZE is not large enough */ + DEBUGLOG(4, "LZ4_createStream %p", lz4s); + if (lz4s == NULL) return NULL; + LZ4_initStream(lz4s, sizeof(*lz4s)); + return lz4s; +} + +#ifndef _MSC_VER /* for some reason, Visual fails the aligment test on 32-bit x86 : + it reports an aligment of 8-bytes, + while actually aligning LZ4_stream_t on 4 bytes. */ +static size_t LZ4_stream_t_alignment(void) +{ + struct { char c; LZ4_stream_t t; } t_a; + return sizeof(t_a) - sizeof(t_a.t); +} +#endif + +LZ4_stream_t* LZ4_initStream (void* buffer, size_t size) +{ + DEBUGLOG(5, "LZ4_initStream"); + if (buffer == NULL) return NULL; + if (size < sizeof(LZ4_stream_t)) return NULL; +#ifndef _MSC_VER /* for some reason, Visual fails the aligment test on 32-bit x86 : + it reports an aligment of 8-bytes, + while actually aligning LZ4_stream_t on 4 bytes. 
*/ + if (((size_t)buffer) & (LZ4_stream_t_alignment() - 1)) return NULL; /* alignment check */ +#endif + MEM_INIT(buffer, 0, sizeof(LZ4_stream_t)); + return (LZ4_stream_t*)buffer; +} + +/* resetStream is now deprecated, + * prefer initStream() which is more general */ +void LZ4_resetStream (LZ4_stream_t* LZ4_stream) +{ + DEBUGLOG(5, "LZ4_resetStream (ctx:%p)", LZ4_stream); + MEM_INIT(LZ4_stream, 0, sizeof(LZ4_stream_t)); +} + +void LZ4_resetStream_fast(LZ4_stream_t* ctx) { + LZ4_prepareTable(&(ctx->internal_donotuse), 0, byU32); +} + +int LZ4_freeStream (LZ4_stream_t* LZ4_stream) +{ + if (!LZ4_stream) return 0; /* support free on NULL */ + DEBUGLOG(5, "LZ4_freeStream %p", LZ4_stream); + FREEMEM(LZ4_stream); + return (0); +} + + +#define HASH_UNIT sizeof(reg_t) +int LZ4_loadDict (LZ4_stream_t* LZ4_dict, const char* dictionary, int dictSize) +{ + LZ4_stream_t_internal* dict = &LZ4_dict->internal_donotuse; + const tableType_t tableType = byU32; + const BYTE* p = (const BYTE*)dictionary; + const BYTE* const dictEnd = p + dictSize; + const BYTE* base; + + DEBUGLOG(4, "LZ4_loadDict (%i bytes from %p into %p)", dictSize, dictionary, LZ4_dict); + + /* It's necessary to reset the context, + * and not just continue it with prepareTable() + * to avoid any risk of generating overflowing matchIndex + * when compressing using this dictionary */ + LZ4_resetStream(LZ4_dict); + + /* We always increment the offset by 64 KB, since, if the dict is longer, + * we truncate it to the last 64k, and if it's shorter, we still want to + * advance by a whole window length so we can provide the guarantee that + * there are only valid offsets in the window, which allows an optimization + * in LZ4_compress_fast_continue() where it uses noDictIssue even when the + * dictionary isn't a full 64k. */ + + if ((dictEnd - p) > 64 KB) p = dictEnd - 64 KB; + base = dictEnd - 64 KB - dict->currentOffset; + dict->dictionary = p; + dict->dictSize = (U32)(dictEnd - p); + dict->currentOffset += 64 KB; + dict->tableType = tableType; + + if (dictSize < (int)HASH_UNIT) { + return 0; + } + + while (p <= dictEnd-HASH_UNIT) { + LZ4_putPosition(p, dict->hashTable, tableType, base); + p+=3; + } + + return (int)dict->dictSize; +} + +void LZ4_attach_dictionary(LZ4_stream_t *working_stream, const LZ4_stream_t *dictionary_stream) { + /* Calling LZ4_resetStream_fast() here makes sure that changes will not be + * erased by subsequent calls to LZ4_resetStream_fast() in case stream was + * marked as having dirty context, e.g. requiring full reset. + */ + LZ4_resetStream_fast(working_stream); + + if (dictionary_stream != NULL) { + /* If the current offset is zero, we will never look in the + * external dictionary context, since there is no value a table + * entry can take that indicate a miss. In that case, we need + * to bump the offset to something non-zero. 
+ */ + if (working_stream->internal_donotuse.currentOffset == 0) { + working_stream->internal_donotuse.currentOffset = 64 KB; + } + working_stream->internal_donotuse.dictCtx = &(dictionary_stream->internal_donotuse); + } else { + working_stream->internal_donotuse.dictCtx = NULL; + } +} + + +static void LZ4_renormDictT(LZ4_stream_t_internal* LZ4_dict, int nextSize) +{ + assert(nextSize >= 0); + if (LZ4_dict->currentOffset + (unsigned)nextSize > 0x80000000) { /* potential ptrdiff_t overflow (32-bits mode) */ + /* rescale hash table */ + U32 const delta = LZ4_dict->currentOffset - 64 KB; + const BYTE* dictEnd = LZ4_dict->dictionary + LZ4_dict->dictSize; + int i; + DEBUGLOG(4, "LZ4_renormDictT"); + for (i=0; i<LZ4_HASH_SIZE_U32; i++) { + if (LZ4_dict->hashTable[i] < delta) LZ4_dict->hashTable[i]=0; + else LZ4_dict->hashTable[i] -= delta; + } + LZ4_dict->currentOffset = 64 KB; + if (LZ4_dict->dictSize > 64 KB) LZ4_dict->dictSize = 64 KB; + LZ4_dict->dictionary = dictEnd - LZ4_dict->dictSize; + } +} + + +int LZ4_compress_fast_continue (LZ4_stream_t* LZ4_stream, + const char* source, char* dest, + int inputSize, int maxOutputSize, + int acceleration) +{ + const tableType_t tableType = byU32; + LZ4_stream_t_internal* streamPtr = &LZ4_stream->internal_donotuse; + const BYTE* dictEnd = streamPtr->dictionary + streamPtr->dictSize; + + DEBUGLOG(5, "LZ4_compress_fast_continue (inputSize=%i)", inputSize); + + if (streamPtr->dirty) return 0; /* Uninitialized structure detected */ + LZ4_renormDictT(streamPtr, inputSize); /* avoid index overflow */ + if (acceleration < 1) acceleration = ACCELERATION_DEFAULT; + + /* invalidate tiny dictionaries */ + if ( (streamPtr->dictSize-1 < 4-1) /* intentional underflow */ + && (dictEnd != (const BYTE*)source) ) { + DEBUGLOG(5, "LZ4_compress_fast_continue: dictSize(%u) at addr:%p is too small", streamPtr->dictSize, streamPtr->dictionary); + streamPtr->dictSize = 0; + streamPtr->dictionary = (const BYTE*)source; + dictEnd = (const BYTE*)source; + } + + /* Check overlapping input/dictionary space */ + { const BYTE* sourceEnd = (const BYTE*) source + inputSize; + if ((sourceEnd > streamPtr->dictionary) && (sourceEnd < dictEnd)) { + streamPtr->dictSize = (U32)(dictEnd - sourceEnd); + if (streamPtr->dictSize > 64 KB) streamPtr->dictSize = 64 KB; + if (streamPtr->dictSize < 4) streamPtr->dictSize = 0; + streamPtr->dictionary = dictEnd - streamPtr->dictSize; + } + } + + /* prefix mode : source data follows dictionary */ + if (dictEnd == (const BYTE*)source) { + if ((streamPtr->dictSize < 64 KB) && (streamPtr->dictSize < streamPtr->currentOffset)) + return LZ4_compress_generic(streamPtr, source, dest, inputSize, NULL, maxOutputSize, limitedOutput, tableType, withPrefix64k, dictSmall, acceleration); + else + return LZ4_compress_generic(streamPtr, source, dest, inputSize, NULL, maxOutputSize, limitedOutput, tableType, withPrefix64k, noDictIssue, acceleration); + } + + /* external dictionary mode */ + { int result; + if (streamPtr->dictCtx) { + /* We depend here on the fact that dictCtx'es (produced by + * LZ4_loadDict) guarantee that their tables contain no references + * to offsets between dictCtx->currentOffset - 64 KB and + * dictCtx->currentOffset - dictCtx->dictSize. This makes it safe + * to use noDictIssue even when the dict isn't a full 64 KB. + */ + if (inputSize > 4 KB) { + /* For compressing large blobs, it is faster to pay the setup + * cost to copy the dictionary's tables into the active context, + * so that the compression loop is only looking into one table. 
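+             * Note : the memcpy below copies the dictionary stream's whole
+             * internal state (hash table, currentOffset, dictionary pointer)
+             * into the working context ; compression then proceeds in plain
+             * usingExtDict mode against that copied state.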
+ */ + memcpy(streamPtr, streamPtr->dictCtx, sizeof(LZ4_stream_t)); + result = LZ4_compress_generic(streamPtr, source, dest, inputSize, NULL, maxOutputSize, limitedOutput, tableType, usingExtDict, noDictIssue, acceleration); + } else { + result = LZ4_compress_generic(streamPtr, source, dest, inputSize, NULL, maxOutputSize, limitedOutput, tableType, usingDictCtx, noDictIssue, acceleration); + } + } else { + if ((streamPtr->dictSize < 64 KB) && (streamPtr->dictSize < streamPtr->currentOffset)) { + result = LZ4_compress_generic(streamPtr, source, dest, inputSize, NULL, maxOutputSize, limitedOutput, tableType, usingExtDict, dictSmall, acceleration); + } else { + result = LZ4_compress_generic(streamPtr, source, dest, inputSize, NULL, maxOutputSize, limitedOutput, tableType, usingExtDict, noDictIssue, acceleration); + } + } + streamPtr->dictionary = (const BYTE*)source; + streamPtr->dictSize = (U32)inputSize; + return result; + } +} + + +/* Hidden debug function, to force-test external dictionary mode */ +int LZ4_compress_forceExtDict (LZ4_stream_t* LZ4_dict, const char* source, char* dest, int srcSize) +{ + LZ4_stream_t_internal* streamPtr = &LZ4_dict->internal_donotuse; + int result; + + LZ4_renormDictT(streamPtr, srcSize); + + if ((streamPtr->dictSize < 64 KB) && (streamPtr->dictSize < streamPtr->currentOffset)) { + result = LZ4_compress_generic(streamPtr, source, dest, srcSize, NULL, 0, notLimited, byU32, usingExtDict, dictSmall, 1); + } else { + result = LZ4_compress_generic(streamPtr, source, dest, srcSize, NULL, 0, notLimited, byU32, usingExtDict, noDictIssue, 1); + } + + streamPtr->dictionary = (const BYTE*)source; + streamPtr->dictSize = (U32)srcSize; + + return result; +} + + +/*! LZ4_saveDict() : + * If previously compressed data block is not guaranteed to remain available at its memory location, + * save it into a safer place (char* safeBuffer). + * Note : you don't need to call LZ4_loadDict() afterwards, + * dictionary is immediately usable, you can therefore call LZ4_compress_fast_continue(). + * Return : saved dictionary size in bytes (necessarily <= dictSize), or 0 if error. + */ +int LZ4_saveDict (LZ4_stream_t* LZ4_dict, char* safeBuffer, int dictSize) +{ + LZ4_stream_t_internal* const dict = &LZ4_dict->internal_donotuse; + const BYTE* const previousDictEnd = dict->dictionary + dict->dictSize; + + if ((U32)dictSize > 64 KB) dictSize = 64 KB; /* useless to define a dictionary > 64 KB */ + if ((U32)dictSize > dict->dictSize) dictSize = (int)dict->dictSize; + + memmove(safeBuffer, previousDictEnd - dictSize, dictSize); + + dict->dictionary = (const BYTE*)safeBuffer; + dict->dictSize = (U32)dictSize; + + return dictSize; +} + + + +/*-******************************* + * Decompression functions + ********************************/ + +typedef enum { endOnOutputSize = 0, endOnInputSize = 1 } endCondition_directive; +typedef enum { decode_full_block = 0, partial_decode = 1 } earlyEnd_directive; + +#undef MIN +#define MIN(a,b) ( (a) < (b) ? (a) : (b) ) + +/* Read the variable-length literal or match length. + * + * ip - pointer to use as input. + * lencheck - end ip. Return an error if ip advances >= lencheck. + * loop_check - check ip >= lencheck in body of loop. Returns loop_error if so. + * initial_check - check ip >= lencheck before start of loop. Returns initial_error if so. + * error (output) - error code. Should be set to 0 before call. 
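+ *
+ * Illustrative example : the byte sequence 255, 255, 3 decodes to
+ * 255 + 255 + 3 = 513 of extra length ; callers add this on top of the 15
+ * (RUN_MASK / ML_MASK) already carried by the token. A byte below 255 ends
+ * the sequence immediately.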
+ */ +typedef enum { loop_error = -2, initial_error = -1, ok = 0 } variable_length_error; +LZ4_FORCE_INLINE unsigned +read_variable_length(const BYTE**ip, const BYTE* lencheck, int loop_check, int initial_check, variable_length_error* error) +{ + unsigned length = 0; + unsigned s; + if (initial_check && unlikely((*ip) >= lencheck)) { /* overflow detection */ + *error = initial_error; + return length; + } + do { + s = **ip; + (*ip)++; + length += s; + if (loop_check && unlikely((*ip) >= lencheck)) { /* overflow detection */ + *error = loop_error; + return length; + } + } while (s==255); + + return length; +} + +/*! LZ4_decompress_generic() : + * This generic decompression function covers all use cases. + * It shall be instantiated several times, using different sets of directives. + * Note that it is important for performance that this function really get inlined, + * in order to remove useless branches during compilation optimization. + */ +LZ4_FORCE_INLINE int +LZ4_decompress_generic( + const char* const src, + char* const dst, + int srcSize, + int outputSize, /* If endOnInput==endOnInputSize, this value is `dstCapacity` */ + + endCondition_directive endOnInput, /* endOnOutputSize, endOnInputSize */ + earlyEnd_directive partialDecoding, /* full, partial */ + dict_directive dict, /* noDict, withPrefix64k, usingExtDict */ + const BYTE* const lowPrefix, /* always <= dst, == dst when no prefix */ + const BYTE* const dictStart, /* only if dict==usingExtDict */ + const size_t dictSize /* note : = 0 if noDict */ + ) +{ + if (src == NULL) return -1; + + { const BYTE* ip = (const BYTE*) src; + const BYTE* const iend = ip + srcSize; + + BYTE* op = (BYTE*) dst; + BYTE* const oend = op + outputSize; + BYTE* cpy; + + const BYTE* const dictEnd = (dictStart == NULL) ? NULL : dictStart + dictSize; + + const int safeDecode = (endOnInput==endOnInputSize); + const int checkOffset = ((safeDecode) && (dictSize < (int)(64 KB))); + + + /* Set up the "end" pointers for the shortcut. */ + const BYTE* const shortiend = iend - (endOnInput ? 14 : 8) /*maxLL*/ - 2 /*offset*/; + const BYTE* const shortoend = oend - (endOnInput ? 14 : 8) /*maxLL*/ - 18 /*maxML*/; + + const BYTE* match; + size_t offset; + unsigned token; + size_t length; + + + DEBUGLOG(5, "LZ4_decompress_generic (srcSize:%i, dstSize:%i)", srcSize, outputSize); + + /* Special cases */ + assert(lowPrefix <= op); + if ((endOnInput) && (unlikely(outputSize==0))) return ((srcSize==1) && (*ip==0)) ? 0 : -1; /* Empty output buffer */ + if ((!endOnInput) && (unlikely(outputSize==0))) return (*ip==0 ? 1 : -1); + if ((endOnInput) && unlikely(srcSize==0)) return -1; + + /* Currently the fast loop shows a regression on qualcomm arm chips. 
*/ +#if LZ4_FAST_DEC_LOOP + if ((oend - op) < FASTLOOP_SAFE_DISTANCE) { + DEBUGLOG(6, "skip fast decode loop"); + goto safe_decode; + } + + /* Fast loop : decode sequences as long as output < iend-FASTLOOP_SAFE_DISTANCE */ + while (1) { + /* Main fastloop assertion: We can always wildcopy FASTLOOP_SAFE_DISTANCE */ + assert(oend - op >= FASTLOOP_SAFE_DISTANCE); + if (endOnInput) assert(ip < iend); + token = *ip++; + length = token >> ML_BITS; /* literal length */ + + assert(!endOnInput || ip <= iend); /* ip < iend before the increment */ + + /* decode literal length */ + if (length == RUN_MASK) { + variable_length_error error = ok; + length += read_variable_length(&ip, iend-RUN_MASK, endOnInput, endOnInput, &error); + if (error == initial_error) goto _output_error; + if ((safeDecode) && unlikely((uptrval)(op)+length<(uptrval)(op))) goto _output_error; /* overflow detection */ + if ((safeDecode) && unlikely((uptrval)(ip)+length<(uptrval)(ip))) goto _output_error; /* overflow detection */ + + /* copy literals */ + cpy = op+length; + LZ4_STATIC_ASSERT(MFLIMIT >= WILDCOPYLENGTH); + if (endOnInput) { /* LZ4_decompress_safe() */ + if ((cpy>oend-32) || (ip+length>iend-32)) goto safe_literal_copy; + LZ4_wildCopy32(op, ip, cpy); + } else { /* LZ4_decompress_fast() */ + if (cpy>oend-8) goto safe_literal_copy; + LZ4_wildCopy8(op, ip, cpy); /* LZ4_decompress_fast() cannot copy more than 8 bytes at a time : + * it doesn't know input length, and only relies on end-of-block properties */ + } + ip += length; op = cpy; + } else { + cpy = op+length; + if (endOnInput) { /* LZ4_decompress_safe() */ + DEBUGLOG(7, "copy %u bytes in a 16-bytes stripe", (unsigned)length); + /* We don't need to check oend, since we check it once for each loop below */ + if (ip > iend-(16 + 1/*max lit + offset + nextToken*/)) goto safe_literal_copy; + /* Literals can only be 14, but hope compilers optimize if we copy by a register size */ + memcpy(op, ip, 16); + } else { /* LZ4_decompress_fast() */ + /* LZ4_decompress_fast() cannot copy more than 8 bytes at a time : + * it doesn't know input length, and relies on end-of-block properties */ + memcpy(op, ip, 8); + if (length > 8) memcpy(op+8, ip+8, 8); + } + ip += length; op = cpy; + } + + /* get offset */ + offset = LZ4_readLE16(ip); ip+=2; + match = op - offset; + + /* get matchlength */ + length = token & ML_MASK; + + if ((checkOffset) && (unlikely(match + dictSize < lowPrefix))) goto _output_error; /* Error : offset outside buffers */ + + if (length == ML_MASK) { + variable_length_error error = ok; + length += read_variable_length(&ip, iend - LASTLITERALS + 1, endOnInput, 0, &error); + if (error != ok) goto _output_error; + if ((safeDecode) && unlikely((uptrval)(op)+length<(uptrval)op)) goto _output_error; /* overflow detection */ + length += MINMATCH; + if (op + length >= oend - FASTLOOP_SAFE_DISTANCE) { + goto safe_match_copy; + } + } else { + length += MINMATCH; + if (op + length >= oend - FASTLOOP_SAFE_DISTANCE) { + goto safe_match_copy; + } + + /* Fastpath check: Avoids a branch in LZ4_wildCopy32 if true */ + if (!(dict == usingExtDict) || (match >= lowPrefix)) { + if (offset >= 8) { + memcpy(op, match, 8); + memcpy(op+8, match+8, 8); + memcpy(op+16, match+16, 2); + op += length; + continue; + } } } + + /* match starting within external dictionary */ + if ((dict==usingExtDict) && (match < lowPrefix)) { + if (unlikely(op+length > oend-LASTLITERALS)) { + if (partialDecoding) length = MIN(length, (size_t)(oend-op)); + else goto _output_error; /* doesn't respect parsing 
restriction */ + } + + if (length <= (size_t)(lowPrefix-match)) { + /* match fits entirely within external dictionary : just copy */ + memmove(op, dictEnd - (lowPrefix-match), length); + op += length; + } else { + /* match stretches into both external dictionary and current block */ + size_t const copySize = (size_t)(lowPrefix - match); + size_t const restSize = length - copySize; + memcpy(op, dictEnd - copySize, copySize); + op += copySize; + if (restSize > (size_t)(op - lowPrefix)) { /* overlap copy */ + BYTE* const endOfMatch = op + restSize; + const BYTE* copyFrom = lowPrefix; + while (op < endOfMatch) *op++ = *copyFrom++; + } else { + memcpy(op, lowPrefix, restSize); + op += restSize; + } } + continue; + } + + /* copy match within block */ + cpy = op + length; + + assert((op <= oend) && (oend-op >= 32)); + if (unlikely(offset<16)) { + LZ4_memcpy_using_offset(op, match, cpy, offset); + } else { + LZ4_wildCopy32(op, match, cpy); + } + + op = cpy; /* wildcopy correction */ + } + safe_decode: +#endif + + /* Main Loop : decode remaining sequences where output < FASTLOOP_SAFE_DISTANCE */ + while (1) { + token = *ip++; + length = token >> ML_BITS; /* literal length */ + + assert(!endOnInput || ip <= iend); /* ip < iend before the increment */ + + /* A two-stage shortcut for the most common case: + * 1) If the literal length is 0..14, and there is enough space, + * enter the shortcut and copy 16 bytes on behalf of the literals + * (in the fast mode, only 8 bytes can be safely copied this way). + * 2) Further if the match length is 4..18, copy 18 bytes in a similar + * manner; but we ensure that there's enough space in the output for + * those 18 bytes earlier, upon entering the shortcut (in other words, + * there is a combined check for both stages). + */ + if ( (endOnInput ? length != RUN_MASK : length <= 8) + /* strictly "less than" on input, to re-enter the loop with at least one byte */ + && likely((endOnInput ? ip < shortiend : 1) & (op <= shortoend)) ) { + /* Copy the literals */ + memcpy(op, ip, endOnInput ? 16 : 8); + op += length; ip += length; + + /* The second stage: prepare for match copying, decode full info. + * If it doesn't work out, the info won't be wasted. */ + length = token & ML_MASK; /* match length */ + offset = LZ4_readLE16(ip); ip += 2; + match = op - offset; + assert(match <= op); /* check overflow */ + + /* Do not deal with overlapping matches. */ + if ( (length != ML_MASK) + && (offset >= 8) + && (dict==withPrefix64k || match >= lowPrefix) ) { + /* Copy the match. */ + memcpy(op + 0, match + 0, 8); + memcpy(op + 8, match + 8, 8); + memcpy(op +16, match +16, 2); + op += length + MINMATCH; + /* Both stages worked, load the next token. */ + continue; + } + + /* The second stage didn't work out, but the info is ready. + * Propel it right to the point of match copying. 
*/ + goto _copy_match; + } + + /* decode literal length */ + if (length == RUN_MASK) { + variable_length_error error = ok; + length += read_variable_length(&ip, iend-RUN_MASK, endOnInput, endOnInput, &error); + if (error == initial_error) goto _output_error; + if ((safeDecode) && unlikely((uptrval)(op)+length<(uptrval)(op))) goto _output_error; /* overflow detection */ + if ((safeDecode) && unlikely((uptrval)(ip)+length<(uptrval)(ip))) goto _output_error; /* overflow detection */ + } + + /* copy literals */ + cpy = op+length; +#if LZ4_FAST_DEC_LOOP + safe_literal_copy: +#endif + LZ4_STATIC_ASSERT(MFLIMIT >= WILDCOPYLENGTH); + if ( ((endOnInput) && ((cpy>oend-MFLIMIT) || (ip+length>iend-(2+1+LASTLITERALS))) ) + || ((!endOnInput) && (cpy>oend-WILDCOPYLENGTH)) ) + { + if (partialDecoding) { + if (cpy > oend) { cpy = oend; assert(op<=oend); length = (size_t)(oend-op); } /* Partial decoding : stop in the middle of literal segment */ + if ((endOnInput) && (ip+length > iend)) goto _output_error; /* Error : read attempt beyond end of input buffer */ + } else { + if ((!endOnInput) && (cpy != oend)) goto _output_error; /* Error : block decoding must stop exactly there */ + if ((endOnInput) && ((ip+length != iend) || (cpy > oend))) goto _output_error; /* Error : input must be consumed */ + } + memcpy(op, ip, length); + ip += length; + op += length; + if (!partialDecoding || (cpy == oend)) { + /* Necessarily EOF, due to parsing restrictions */ + break; + } + + } else { + LZ4_wildCopy8(op, ip, cpy); /* may overwrite up to WILDCOPYLENGTH beyond cpy */ + ip += length; op = cpy; + } + + /* get offset */ + offset = LZ4_readLE16(ip); ip+=2; + match = op - offset; + + /* get matchlength */ + length = token & ML_MASK; + + _copy_match: + if ((checkOffset) && (unlikely(match + dictSize < lowPrefix))) goto _output_error; /* Error : offset outside buffers */ + if (!partialDecoding) { + assert(oend > op); + assert(oend - op >= 4); + LZ4_write32(op, 0); /* silence an msan warning when offset==0; costs <1%; */ + } /* note : when partialDecoding, there is no guarantee that at least 4 bytes remain available in output buffer */ + + if (length == ML_MASK) { + variable_length_error error = ok; + length += read_variable_length(&ip, iend - LASTLITERALS + 1, endOnInput, 0, &error); + if (error != ok) goto _output_error; + if ((safeDecode) && unlikely((uptrval)(op)+length<(uptrval)op)) goto _output_error; /* overflow detection */ + } + length += MINMATCH; + +#if LZ4_FAST_DEC_LOOP + safe_match_copy: +#endif + /* match starting within external dictionary */ + if ((dict==usingExtDict) && (match < lowPrefix)) { + if (unlikely(op+length > oend-LASTLITERALS)) { + if (partialDecoding) length = MIN(length, (size_t)(oend-op)); + else goto _output_error; /* doesn't respect parsing restriction */ + } + + if (length <= (size_t)(lowPrefix-match)) { + /* match fits entirely within external dictionary : just copy */ + memmove(op, dictEnd - (lowPrefix-match), length); + op += length; + } else { + /* match stretches into both external dictionary and current block */ + size_t const copySize = (size_t)(lowPrefix - match); + size_t const restSize = length - copySize; + memcpy(op, dictEnd - copySize, copySize); + op += copySize; + if (restSize > (size_t)(op - lowPrefix)) { /* overlap copy */ + BYTE* const endOfMatch = op + restSize; + const BYTE* copyFrom = lowPrefix; + while (op < endOfMatch) *op++ = *copyFrom++; + } else { + memcpy(op, lowPrefix, restSize); + op += restSize; + } } + continue; + } + + /* copy match within block */ + cpy = op + 
length; + + /* partialDecoding : may end anywhere within the block */ + assert(op<=oend); + if (partialDecoding && (cpy > oend-MATCH_SAFEGUARD_DISTANCE)) { + size_t const mlen = MIN(length, (size_t)(oend-op)); + const BYTE* const matchEnd = match + mlen; + BYTE* const copyEnd = op + mlen; + if (matchEnd > op) { /* overlap copy */ + while (op < copyEnd) *op++ = *match++; + } else { + memcpy(op, match, mlen); + } + op = copyEnd; + if (op==oend) break; + continue; + } + + if (unlikely(offset<8)) { + op[0] = match[0]; + op[1] = match[1]; + op[2] = match[2]; + op[3] = match[3]; + match += inc32table[offset]; + memcpy(op+4, match, 4); + match -= dec64table[offset]; + } else { + memcpy(op, match, 8); + match += 8; + } + op += 8; + + if (unlikely(cpy > oend-MATCH_SAFEGUARD_DISTANCE)) { + BYTE* const oCopyLimit = oend - (WILDCOPYLENGTH-1); + if (cpy > oend-LASTLITERALS) goto _output_error; /* Error : last LASTLITERALS bytes must be literals (uncompressed) */ + if (op < oCopyLimit) { + LZ4_wildCopy8(op, match, oCopyLimit); + match += oCopyLimit - op; + op = oCopyLimit; + } + while (op < cpy) *op++ = *match++; + } else { + memcpy(op, match, 8); + if (length > 16) LZ4_wildCopy8(op+8, match+8, cpy); + } + op = cpy; /* wildcopy correction */ + } + + /* end of decoding */ + if (endOnInput) + return (int) (((char*)op)-dst); /* Nb of output bytes decoded */ + else + return (int) (((const char*)ip)-src); /* Nb of input bytes read */ + + /* Overflow error detected */ + _output_error: + return (int) (-(((const char*)ip)-src))-1; + } +} + + +/*===== Instantiate the API decoding functions. =====*/ + +LZ4_FORCE_O2_GCC_PPC64LE +int LZ4_decompress_safe(const char* source, char* dest, int compressedSize, int maxDecompressedSize) +{ + return LZ4_decompress_generic(source, dest, compressedSize, maxDecompressedSize, + endOnInputSize, decode_full_block, noDict, + (BYTE*)dest, NULL, 0); +} + +LZ4_FORCE_O2_GCC_PPC64LE +int LZ4_decompress_safe_partial(const char* src, char* dst, int compressedSize, int targetOutputSize, int dstCapacity) +{ + dstCapacity = MIN(targetOutputSize, dstCapacity); + return LZ4_decompress_generic(src, dst, compressedSize, dstCapacity, + endOnInputSize, partial_decode, + noDict, (BYTE*)dst, NULL, 0); +} + +LZ4_FORCE_O2_GCC_PPC64LE +int LZ4_decompress_fast(const char* source, char* dest, int originalSize) +{ + return LZ4_decompress_generic(source, dest, 0, originalSize, + endOnOutputSize, decode_full_block, withPrefix64k, + (BYTE*)dest - 64 KB, NULL, 0); +} + +/*===== Instantiate a few more decoding cases, used more than once. =====*/ + +LZ4_FORCE_O2_GCC_PPC64LE /* Exported, an obsolete API function. */ +int LZ4_decompress_safe_withPrefix64k(const char* source, char* dest, int compressedSize, int maxOutputSize) +{ + return LZ4_decompress_generic(source, dest, compressedSize, maxOutputSize, + endOnInputSize, decode_full_block, withPrefix64k, + (BYTE*)dest - 64 KB, NULL, 0); +} + +/* Another obsolete API function, paired with the previous one. */ +int LZ4_decompress_fast_withPrefix64k(const char* source, char* dest, int originalSize) +{ + /* LZ4_decompress_fast doesn't validate match offsets, + * and thus serves well with any prefixed dictionary. 
*/ + return LZ4_decompress_fast(source, dest, originalSize); +} + +LZ4_FORCE_O2_GCC_PPC64LE +static int LZ4_decompress_safe_withSmallPrefix(const char* source, char* dest, int compressedSize, int maxOutputSize, + size_t prefixSize) +{ + return LZ4_decompress_generic(source, dest, compressedSize, maxOutputSize, + endOnInputSize, decode_full_block, noDict, + (BYTE*)dest-prefixSize, NULL, 0); +} + +LZ4_FORCE_O2_GCC_PPC64LE +int LZ4_decompress_safe_forceExtDict(const char* source, char* dest, + int compressedSize, int maxOutputSize, + const void* dictStart, size_t dictSize) +{ + return LZ4_decompress_generic(source, dest, compressedSize, maxOutputSize, + endOnInputSize, decode_full_block, usingExtDict, + (BYTE*)dest, (const BYTE*)dictStart, dictSize); +} + +LZ4_FORCE_O2_GCC_PPC64LE +static int LZ4_decompress_fast_extDict(const char* source, char* dest, int originalSize, + const void* dictStart, size_t dictSize) +{ + return LZ4_decompress_generic(source, dest, 0, originalSize, + endOnOutputSize, decode_full_block, usingExtDict, + (BYTE*)dest, (const BYTE*)dictStart, dictSize); +} + +/* The "double dictionary" mode, for use with e.g. ring buffers: the first part + * of the dictionary is passed as prefix, and the second via dictStart + dictSize. + * These routines are used only once, in LZ4_decompress_*_continue(). + */ +LZ4_FORCE_INLINE +int LZ4_decompress_safe_doubleDict(const char* source, char* dest, int compressedSize, int maxOutputSize, + size_t prefixSize, const void* dictStart, size_t dictSize) +{ + return LZ4_decompress_generic(source, dest, compressedSize, maxOutputSize, + endOnInputSize, decode_full_block, usingExtDict, + (BYTE*)dest-prefixSize, (const BYTE*)dictStart, dictSize); +} + +LZ4_FORCE_INLINE +int LZ4_decompress_fast_doubleDict(const char* source, char* dest, int originalSize, + size_t prefixSize, const void* dictStart, size_t dictSize) +{ + return LZ4_decompress_generic(source, dest, 0, originalSize, + endOnOutputSize, decode_full_block, usingExtDict, + (BYTE*)dest-prefixSize, (const BYTE*)dictStart, dictSize); +} + +/*===== streaming decompression functions =====*/ + +LZ4_streamDecode_t* LZ4_createStreamDecode(void) +{ + LZ4_streamDecode_t* lz4s = (LZ4_streamDecode_t*) ALLOC_AND_ZERO(sizeof(LZ4_streamDecode_t)); + LZ4_STATIC_ASSERT(LZ4_STREAMDECODESIZE >= sizeof(LZ4_streamDecode_t_internal)); /* A compilation error here means LZ4_STREAMDECODESIZE is not large enough */ + return lz4s; +} + +int LZ4_freeStreamDecode (LZ4_streamDecode_t* LZ4_stream) +{ + if (LZ4_stream == NULL) return 0; /* support free on NULL */ + FREEMEM(LZ4_stream); + return 0; +} + +/*! LZ4_setStreamDecode() : + * Use this function to instruct where to find the dictionary. + * This function is not necessary if previous data is still available where it was decoded. + * Loading a size of 0 is allowed (same effect as no dictionary). + * @return : 1 if OK, 0 if error + */ +int LZ4_setStreamDecode (LZ4_streamDecode_t* LZ4_streamDecode, const char* dictionary, int dictSize) +{ + LZ4_streamDecode_t_internal* lz4sd = &LZ4_streamDecode->internal_donotuse; + lz4sd->prefixSize = (size_t) dictSize; + lz4sd->prefixEnd = (const BYTE*) dictionary + dictSize; + lz4sd->externalDict = NULL; + lz4sd->extDictSize = 0; + return 1; +} + +/*! LZ4_decoderRingBufferSize() : + * when setting a ring buffer for streaming decompression (optional scenario), + * provides the minimum size of this ring buffer + * to be compatible with any source respecting maxBlockSize condition. 
+ * Note : in a ring buffer scenario, + * blocks are presumed decompressed next to each other. + * When not enough space remains for next block (remainingSize < maxBlockSize), + * decoding resumes from beginning of ring buffer. + * @return : minimum ring buffer size, + * or 0 if there is an error (invalid maxBlockSize). + */ +int LZ4_decoderRingBufferSize(int maxBlockSize) +{ + if (maxBlockSize < 0) return 0; + if (maxBlockSize > LZ4_MAX_INPUT_SIZE) return 0; + if (maxBlockSize < 16) maxBlockSize = 16; + return LZ4_DECODER_RING_BUFFER_SIZE(maxBlockSize); +} + +/* +*_continue() : + These decoding functions allow decompression of multiple blocks in "streaming" mode. + Previously decoded blocks must still be available at the memory position where they were decoded. + If it's not possible, save the relevant part of decoded data into a safe buffer, + and indicate where it stands using LZ4_setStreamDecode() +*/ +LZ4_FORCE_O2_GCC_PPC64LE +int LZ4_decompress_safe_continue (LZ4_streamDecode_t* LZ4_streamDecode, const char* source, char* dest, int compressedSize, int maxOutputSize) +{ + LZ4_streamDecode_t_internal* lz4sd = &LZ4_streamDecode->internal_donotuse; + int result; + + if (lz4sd->prefixSize == 0) { + /* The first call, no dictionary yet. */ + assert(lz4sd->extDictSize == 0); + result = LZ4_decompress_safe(source, dest, compressedSize, maxOutputSize); + if (result <= 0) return result; + lz4sd->prefixSize = (size_t)result; + lz4sd->prefixEnd = (BYTE*)dest + result; + } else if (lz4sd->prefixEnd == (BYTE*)dest) { + /* They're rolling the current segment. */ + if (lz4sd->prefixSize >= 64 KB - 1) + result = LZ4_decompress_safe_withPrefix64k(source, dest, compressedSize, maxOutputSize); + else if (lz4sd->extDictSize == 0) + result = LZ4_decompress_safe_withSmallPrefix(source, dest, compressedSize, maxOutputSize, + lz4sd->prefixSize); + else + result = LZ4_decompress_safe_doubleDict(source, dest, compressedSize, maxOutputSize, + lz4sd->prefixSize, lz4sd->externalDict, lz4sd->extDictSize); + if (result <= 0) return result; + lz4sd->prefixSize += (size_t)result; + lz4sd->prefixEnd += result; + } else { + /* The buffer wraps around, or they're switching to another buffer. 
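+         * The data decoded on previous calls (ending at prefixEnd) is
+         * re-registered below as an external dictionary, so the new block can
+         * still reference matches inside it.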
*/ + lz4sd->extDictSize = lz4sd->prefixSize; + lz4sd->externalDict = lz4sd->prefixEnd - lz4sd->extDictSize; + result = LZ4_decompress_safe_forceExtDict(source, dest, compressedSize, maxOutputSize, + lz4sd->externalDict, lz4sd->extDictSize); + if (result <= 0) return result; + lz4sd->prefixSize = (size_t)result; + lz4sd->prefixEnd = (BYTE*)dest + result; + } + + return result; +} + +LZ4_FORCE_O2_GCC_PPC64LE +int LZ4_decompress_fast_continue (LZ4_streamDecode_t* LZ4_streamDecode, const char* source, char* dest, int originalSize) +{ + LZ4_streamDecode_t_internal* lz4sd = &LZ4_streamDecode->internal_donotuse; + int result; + assert(originalSize >= 0); + + if (lz4sd->prefixSize == 0) { + assert(lz4sd->extDictSize == 0); + result = LZ4_decompress_fast(source, dest, originalSize); + if (result <= 0) return result; + lz4sd->prefixSize = (size_t)originalSize; + lz4sd->prefixEnd = (BYTE*)dest + originalSize; + } else if (lz4sd->prefixEnd == (BYTE*)dest) { + if (lz4sd->prefixSize >= 64 KB - 1 || lz4sd->extDictSize == 0) + result = LZ4_decompress_fast(source, dest, originalSize); + else + result = LZ4_decompress_fast_doubleDict(source, dest, originalSize, + lz4sd->prefixSize, lz4sd->externalDict, lz4sd->extDictSize); + if (result <= 0) return result; + lz4sd->prefixSize += (size_t)originalSize; + lz4sd->prefixEnd += originalSize; + } else { + lz4sd->extDictSize = lz4sd->prefixSize; + lz4sd->externalDict = lz4sd->prefixEnd - lz4sd->extDictSize; + result = LZ4_decompress_fast_extDict(source, dest, originalSize, + lz4sd->externalDict, lz4sd->extDictSize); + if (result <= 0) return result; + lz4sd->prefixSize = (size_t)originalSize; + lz4sd->prefixEnd = (BYTE*)dest + originalSize; + } + + return result; +} + + +/* +Advanced decoding functions : +*_usingDict() : + These decoding functions work the same as "_continue" ones, + the dictionary must be explicitly provided within parameters +*/ + +int LZ4_decompress_safe_usingDict(const char* source, char* dest, int compressedSize, int maxOutputSize, const char* dictStart, int dictSize) +{ + if (dictSize==0) + return LZ4_decompress_safe(source, dest, compressedSize, maxOutputSize); + if (dictStart+dictSize == dest) { + if (dictSize >= 64 KB - 1) + return LZ4_decompress_safe_withPrefix64k(source, dest, compressedSize, maxOutputSize); + return LZ4_decompress_safe_withSmallPrefix(source, dest, compressedSize, maxOutputSize, dictSize); + } + return LZ4_decompress_safe_forceExtDict(source, dest, compressedSize, maxOutputSize, dictStart, dictSize); +} + +int LZ4_decompress_fast_usingDict(const char* source, char* dest, int originalSize, const char* dictStart, int dictSize) +{ + if (dictSize==0 || dictStart+dictSize == dest) + return LZ4_decompress_fast(source, dest, originalSize); + return LZ4_decompress_fast_extDict(source, dest, originalSize, dictStart, dictSize); +} + + +/*=************************************************* +* Obsolete Functions +***************************************************/ +/* obsolete compression functions */ +int LZ4_compress_limitedOutput(const char* source, char* dest, int inputSize, int maxOutputSize) +{ + return LZ4_compress_default(source, dest, inputSize, maxOutputSize); +} +int LZ4_compress(const char* source, char* dest, int inputSize) +{ + return LZ4_compress_default(source, dest, inputSize, LZ4_compressBound(inputSize)); +} +int LZ4_compress_limitedOutput_withState (void* state, const char* src, char* dst, int srcSize, int dstSize) +{ + return LZ4_compress_fast_extState(state, src, dst, srcSize, dstSize, 1); +} +int 
LZ4_compress_withState (void* state, const char* src, char* dst, int srcSize) +{ + return LZ4_compress_fast_extState(state, src, dst, srcSize, LZ4_compressBound(srcSize), 1); +} +int LZ4_compress_limitedOutput_continue (LZ4_stream_t* LZ4_stream, const char* src, char* dst, int srcSize, int dstCapacity) +{ + return LZ4_compress_fast_continue(LZ4_stream, src, dst, srcSize, dstCapacity, 1); +} +int LZ4_compress_continue (LZ4_stream_t* LZ4_stream, const char* source, char* dest, int inputSize) +{ + return LZ4_compress_fast_continue(LZ4_stream, source, dest, inputSize, LZ4_compressBound(inputSize), 1); +} + +/* +These decompression functions are deprecated and should no longer be used. +They are only provided here for compatibility with older user programs. +- LZ4_uncompress is totally equivalent to LZ4_decompress_fast +- LZ4_uncompress_unknownOutputSize is totally equivalent to LZ4_decompress_safe +*/ +int LZ4_uncompress (const char* source, char* dest, int outputSize) +{ + return LZ4_decompress_fast(source, dest, outputSize); +} +int LZ4_uncompress_unknownOutputSize (const char* source, char* dest, int isize, int maxOutputSize) +{ + return LZ4_decompress_safe(source, dest, isize, maxOutputSize); +} + +/* Obsolete Streaming functions */ + +int LZ4_sizeofStreamState() { return LZ4_STREAMSIZE; } + +int LZ4_resetStreamState(void* state, char* inputBuffer) +{ + (void)inputBuffer; + LZ4_resetStream((LZ4_stream_t*)state); + return 0; +} + +void* LZ4_create (char* inputBuffer) +{ + (void)inputBuffer; + return LZ4_createStream(); +} + +char* LZ4_slideInputBuffer (void* state) +{ + /* avoid const char * -> char * conversion warning */ + return (char *)(uptrval)((LZ4_stream_t*)state)->internal_donotuse.dictionary; +} + +#endif /* LZ4_COMMONDEFS_ONLY */ + +} diff --git a/libs/tracy/common/tracy_lz4.hpp b/libs/tracy/common/tracy_lz4.hpp @@ -0,0 +1,679 @@ +/* + * LZ4 - Fast LZ compression algorithm + * Header File + * Copyright (C) 2011-present, Yann Collet. + + BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following disclaimer + in the documentation and/or other materials provided with the + distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ + You can contact the author at : + - LZ4 homepage : http://www.lz4.org + - LZ4 source repository : https://github.com/lz4/lz4 +*/ + + +#ifndef TRACY_LZ4_H_2983827168210 +#define TRACY_LZ4_H_2983827168210 + +/* --- Dependency --- */ +#include <stddef.h> /* size_t */ +#include <stdint.h> + +namespace tracy +{ + +/** + Introduction + + LZ4 is lossless compression algorithm, providing compression speed at 500 MB/s per core, + scalable with multi-cores CPU. It features an extremely fast decoder, with speed in + multiple GB/s per core, typically reaching RAM speed limits on multi-core systems. + + The LZ4 compression library provides in-memory compression and decompression functions. + It gives full buffer control to user. + Compression can be done in: + - a single step (described as Simple Functions) + - a single step, reusing a context (described in Advanced Functions) + - unbounded multiple steps (described as Streaming compression) + + lz4.h generates and decodes LZ4-compressed blocks (doc/lz4_Block_format.md). + Decompressing a block requires additional metadata, such as its compressed size. + Each application is free to encode and pass such metadata in whichever way it wants. + + lz4.h only handle blocks, it can not generate Frames. + + Blocks are different from Frames (doc/lz4_Frame_format.md). + Frames bundle both blocks and metadata in a specified manner. + This are required for compressed data to be self-contained and portable. + Frame format is delivered through a companion API, declared in lz4frame.h. + Note that the `lz4` CLI can only manage frames. +*/ + +/*^*************************************************************** +* Export parameters +*****************************************************************/ +/* +* LZ4_DLL_EXPORT : +* Enable exporting of functions when building a Windows DLL +* LZ4LIB_VISIBILITY : +* Control library symbols visibility. +*/ +#ifndef LZ4LIB_VISIBILITY +# if defined(__GNUC__) && (__GNUC__ >= 4) +# define LZ4LIB_VISIBILITY __attribute__ ((visibility ("default"))) +# else +# define LZ4LIB_VISIBILITY +# endif +#endif +#if defined(LZ4_DLL_EXPORT) && (LZ4_DLL_EXPORT==1) +# define LZ4LIB_API __declspec(dllexport) LZ4LIB_VISIBILITY +#elif defined(LZ4_DLL_IMPORT) && (LZ4_DLL_IMPORT==1) +# define LZ4LIB_API __declspec(dllimport) LZ4LIB_VISIBILITY /* It isn't required but allows to generate better code, saving a function pointer load from the IAT and an indirect jump.*/ +#else +# define LZ4LIB_API LZ4LIB_VISIBILITY +#endif + +/*------ Version ------*/ +#define LZ4_VERSION_MAJOR 1 /* for breaking interface changes */ +#define LZ4_VERSION_MINOR 9 /* for new (non-breaking) interface capabilities */ +#define LZ4_VERSION_RELEASE 1 /* for tweaks, bug-fixes, or development */ + +#define LZ4_VERSION_NUMBER (LZ4_VERSION_MAJOR *100*100 + LZ4_VERSION_MINOR *100 + LZ4_VERSION_RELEASE) + +#define LZ4_LIB_VERSION LZ4_VERSION_MAJOR.LZ4_VERSION_MINOR.LZ4_VERSION_RELEASE +#define LZ4_QUOTE(str) #str +#define LZ4_EXPAND_AND_QUOTE(str) LZ4_QUOTE(str) +#define LZ4_VERSION_STRING LZ4_EXPAND_AND_QUOTE(LZ4_LIB_VERSION) + +LZ4LIB_API int LZ4_versionNumber (void); /**< library version number; useful to check dll version */ +LZ4LIB_API const char* LZ4_versionString (void); /**< library version string; useful to check dll version */ + + +/*-************************************ +* Tuning parameter +**************************************/ +/*! + * LZ4_MEMORY_USAGE : + * Memory usage formula : N->2^N Bytes (examples : 10 -> 1KB; 12 -> 4KB ; 16 -> 64KB; 20 -> 1MB; etc.) 
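+ * Note : in this copy the fallback value defined just below is 12 (a 4 KB
+ * table), not the 14 / 16 KB mentioned at the end of this comment.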
+ * Increasing memory usage improves compression ratio. + * Reduced memory usage may improve speed, thanks to better cache locality. + * Default value is 14, for 16KB, which nicely fits into Intel x86 L1 cache + */ +#ifndef LZ4_MEMORY_USAGE +# define LZ4_MEMORY_USAGE 12 +#endif + + +/*-************************************ +* Simple Functions +**************************************/ +/*! LZ4_compress_default() : + Compresses 'srcSize' bytes from buffer 'src' + into already allocated 'dst' buffer of size 'dstCapacity'. + Compression is guaranteed to succeed if 'dstCapacity' >= LZ4_compressBound(srcSize). + It also runs faster, so it's a recommended setting. + If the function cannot compress 'src' into a more limited 'dst' budget, + compression stops *immediately*, and the function result is zero. + In which case, 'dst' content is undefined (invalid). + srcSize : max supported value is LZ4_MAX_INPUT_SIZE. + dstCapacity : size of buffer 'dst' (which must be already allocated) + @return : the number of bytes written into buffer 'dst' (necessarily <= dstCapacity) + or 0 if compression fails + Note : This function is protected against buffer overflow scenarios (never writes outside 'dst' buffer, nor read outside 'source' buffer). +*/ +LZ4LIB_API int LZ4_compress_default(const char* src, char* dst, int srcSize, int dstCapacity); + +/*! LZ4_decompress_safe() : + compressedSize : is the exact complete size of the compressed block. + dstCapacity : is the size of destination buffer, which must be already allocated. + @return : the number of bytes decompressed into destination buffer (necessarily <= dstCapacity) + If destination buffer is not large enough, decoding will stop and output an error code (negative value). + If the source stream is detected malformed, the function will stop decoding and return a negative result. + Note : This function is protected against malicious data packets (never writes outside 'dst' buffer, nor read outside 'source' buffer). +*/ +LZ4LIB_API int LZ4_decompress_safe (const char* src, char* dst, int compressedSize, int dstCapacity); + + +/*-************************************ +* Advanced Functions +**************************************/ +#define LZ4_MAX_INPUT_SIZE 0x7E000000 /* 2 113 929 216 bytes */ +#define LZ4_COMPRESSBOUND(isize) ((unsigned)(isize) > (unsigned)LZ4_MAX_INPUT_SIZE ? 0 : (isize) + ((isize)/255) + 16) + +/*! LZ4_compressBound() : + Provides the maximum size that LZ4 compression may output in a "worst case" scenario (input data not compressible) + This function is primarily useful for memory allocation purposes (destination buffer size). + Macro LZ4_COMPRESSBOUND() is also provided for compilation-time evaluation (stack memory allocation for example). + Note that LZ4_compress_default() compresses faster when dstCapacity is >= LZ4_compressBound(srcSize) + inputSize : max supported value is LZ4_MAX_INPUT_SIZE + return : maximum output size in a "worst case" scenario + or 0, if input size is incorrect (too large or negative) +*/ +LZ4LIB_API int LZ4_compressBound(int inputSize); + +/*! LZ4_compress_fast() : + Same as LZ4_compress_default(), but allows selection of "acceleration" factor. + The larger the acceleration value, the faster the algorithm, but also the lesser the compression. + It's a trade-off. It can be fine tuned, with each successive value providing roughly +~3% to speed. + An acceleration value of "1" is the same as regular LZ4_compress_default() + Values <= 0 will be replaced by ACCELERATION_DEFAULT (currently == 1, see lz4.c). 
+*/ +LZ4LIB_API int LZ4_compress_fast (const char* src, char* dst, int srcSize, int dstCapacity, int acceleration); + + +/*! LZ4_compress_fast_extState() : + * Same as LZ4_compress_fast(), using an externally allocated memory space for its state. + * Use LZ4_sizeofState() to know how much memory must be allocated, + * and allocate it on 8-bytes boundaries (using `malloc()` typically). + * Then, provide this buffer as `void* state` to compression function. + */ +LZ4LIB_API int LZ4_sizeofState(void); +LZ4LIB_API int LZ4_compress_fast_extState (void* state, const char* src, char* dst, int srcSize, int dstCapacity, int acceleration); + + +/*! LZ4_compress_destSize() : + * Reverse the logic : compresses as much data as possible from 'src' buffer + * into already allocated buffer 'dst', of size >= 'targetDestSize'. + * This function either compresses the entire 'src' content into 'dst' if it's large enough, + * or fill 'dst' buffer completely with as much data as possible from 'src'. + * note: acceleration parameter is fixed to "default". + * + * *srcSizePtr : will be modified to indicate how many bytes where read from 'src' to fill 'dst'. + * New value is necessarily <= input value. + * @return : Nb bytes written into 'dst' (necessarily <= targetDestSize) + * or 0 if compression fails. +*/ +LZ4LIB_API int LZ4_compress_destSize (const char* src, char* dst, int* srcSizePtr, int targetDstSize); + + +/*! LZ4_decompress_safe_partial() : + * Decompress an LZ4 compressed block, of size 'srcSize' at position 'src', + * into destination buffer 'dst' of size 'dstCapacity'. + * Up to 'targetOutputSize' bytes will be decoded. + * The function stops decoding on reaching this objective, + * which can boost performance when only the beginning of a block is required. + * + * @return : the number of bytes decoded in `dst` (necessarily <= dstCapacity) + * If source stream is detected malformed, function returns a negative result. + * + * Note : @return can be < targetOutputSize, if compressed block contains less data. + * + * Note 2 : this function features 2 parameters, targetOutputSize and dstCapacity, + * and expects targetOutputSize <= dstCapacity. + * It effectively stops decoding on reaching targetOutputSize, + * so dstCapacity is kind of redundant. + * This is because in a previous version of this function, + * decoding operation would not "break" a sequence in the middle. + * As a consequence, there was no guarantee that decoding would stop at exactly targetOutputSize, + * it could write more bytes, though only up to dstCapacity. + * Some "margin" used to be required for this operation to work properly. + * This is no longer necessary. + * The function nonetheless keeps its signature, in an effort to not break API. + */ +LZ4LIB_API int LZ4_decompress_safe_partial (const char* src, char* dst, int srcSize, int targetOutputSize, int dstCapacity); + + +/*-********************************************* +* Streaming Compression Functions +***********************************************/ +typedef union LZ4_stream_u LZ4_stream_t; /* incomplete type (defined later) */ + +LZ4LIB_API LZ4_stream_t* LZ4_createStream(void); +LZ4LIB_API int LZ4_freeStream (LZ4_stream_t* streamPtr); + +/*! LZ4_resetStream_fast() : v1.9.0+ + * Use this to prepare an LZ4_stream_t for a new chain of dependent blocks + * (e.g., LZ4_compress_fast_continue()). + * + * An LZ4_stream_t must be initialized once before usage. + * This is automatically done when created by LZ4_createStream(). 
+ * However, should the LZ4_stream_t be simply declared on stack (for example), + * it's necessary to initialize it first, using LZ4_initStream(). + * + * After init, start any new stream with LZ4_resetStream_fast(). + * A same LZ4_stream_t can be re-used multiple times consecutively + * and compress multiple streams, + * provided that it starts each new stream with LZ4_resetStream_fast(). + * + * LZ4_resetStream_fast() is much faster than LZ4_initStream(), + * but is not compatible with memory regions containing garbage data. + * + * Note: it's only useful to call LZ4_resetStream_fast() + * in the context of streaming compression. + * The *extState* functions perform their own resets. + * Invoking LZ4_resetStream_fast() before is redundant, and even counterproductive. + */ +LZ4LIB_API void LZ4_resetStream_fast (LZ4_stream_t* streamPtr); + +/*! LZ4_loadDict() : + * Use this function to reference a static dictionary into LZ4_stream_t. + * The dictionary must remain available during compression. + * LZ4_loadDict() triggers a reset, so any previous data will be forgotten. + * The same dictionary will have to be loaded on decompression side for successful decoding. + * Dictionary are useful for better compression of small data (KB range). + * While LZ4 accept any input as dictionary, + * results are generally better when using Zstandard's Dictionary Builder. + * Loading a size of 0 is allowed, and is the same as reset. + * @return : loaded dictionary size, in bytes (necessarily <= 64 KB) + */ +LZ4LIB_API int LZ4_loadDict (LZ4_stream_t* streamPtr, const char* dictionary, int dictSize); + +/*! LZ4_compress_fast_continue() : + * Compress 'src' content using data from previously compressed blocks, for better compression ratio. + * 'dst' buffer must be already allocated. + * If dstCapacity >= LZ4_compressBound(srcSize), compression is guaranteed to succeed, and runs faster. + * + * @return : size of compressed block + * or 0 if there is an error (typically, cannot fit into 'dst'). + * + * Note 1 : Each invocation to LZ4_compress_fast_continue() generates a new block. + * Each block has precise boundaries. + * Each block must be decompressed separately, calling LZ4_decompress_*() with relevant metadata. + * It's not possible to append blocks together and expect a single invocation of LZ4_decompress_*() to decompress them together. + * + * Note 2 : The previous 64KB of source data is __assumed__ to remain present, unmodified, at same address in memory ! + * + * Note 3 : When input is structured as a double-buffer, each buffer can have any size, including < 64 KB. + * Make sure that buffers are separated, by at least one byte. + * This construction ensures that each block only depends on previous block. + * + * Note 4 : If input buffer is a ring-buffer, it can have any size, including < 64 KB. + * + * Note 5 : After an error, the stream status is undefined (invalid), it can only be reset or freed. + */ +LZ4LIB_API int LZ4_compress_fast_continue (LZ4_stream_t* streamPtr, const char* src, char* dst, int srcSize, int dstCapacity, int acceleration); + +/*! LZ4_saveDict() : + * If last 64KB data cannot be guaranteed to remain available at its current memory location, + * save it into a safer place (char* safeBuffer). + * This is schematically equivalent to a memcpy() followed by LZ4_loadDict(), + * but is much faster, because LZ4_saveDict() doesn't need to rebuild tables. + * @return : saved dictionary size in bytes (necessarily <= maxDictSize), or 0 if error. 
+ */ +LZ4LIB_API int LZ4_saveDict (LZ4_stream_t* streamPtr, char* safeBuffer, int maxDictSize); + + +/*-********************************************** +* Streaming Decompression Functions +* Bufferless synchronous API +************************************************/ +typedef union LZ4_streamDecode_u LZ4_streamDecode_t; /* tracking context */ + +/*! LZ4_createStreamDecode() and LZ4_freeStreamDecode() : + * creation / destruction of streaming decompression tracking context. + * A tracking context can be re-used multiple times. + */ +LZ4LIB_API LZ4_streamDecode_t* LZ4_createStreamDecode(void); +LZ4LIB_API int LZ4_freeStreamDecode (LZ4_streamDecode_t* LZ4_stream); + +/*! LZ4_setStreamDecode() : + * An LZ4_streamDecode_t context can be allocated once and re-used multiple times. + * Use this function to start decompression of a new stream of blocks. + * A dictionary can optionally be set. Use NULL or size 0 for a reset order. + * Dictionary is presumed stable : it must remain accessible and unmodified during next decompression. + * @return : 1 if OK, 0 if error + */ +LZ4LIB_API int LZ4_setStreamDecode (LZ4_streamDecode_t* LZ4_streamDecode, const char* dictionary, int dictSize); + +/*! LZ4_decoderRingBufferSize() : v1.8.2+ + * Note : in a ring buffer scenario (optional), + * blocks are presumed decompressed next to each other + * up to the moment there is not enough remaining space for next block (remainingSize < maxBlockSize), + * at which stage it resumes from beginning of ring buffer. + * When setting such a ring buffer for streaming decompression, + * provides the minimum size of this ring buffer + * to be compatible with any source respecting maxBlockSize condition. + * @return : minimum ring buffer size, + * or 0 if there is an error (invalid maxBlockSize). + */ +LZ4LIB_API int LZ4_decoderRingBufferSize(int maxBlockSize); +#define LZ4_DECODER_RING_BUFFER_SIZE(maxBlockSize) (65536 + 14 + (maxBlockSize)) /* for static allocation; maxBlockSize presumed valid */ + +/*! LZ4_decompress_*_continue() : + * These decoding functions allow decompression of consecutive blocks in "streaming" mode. + * A block is an unsplittable entity, it must be presented entirely to a decompression function. + * Decompression functions only accepts one block at a time. + * The last 64KB of previously decoded data *must* remain available and unmodified at the memory position where they were decoded. + * If less than 64KB of data has been decoded, all the data must be present. + * + * Special : if decompression side sets a ring buffer, it must respect one of the following conditions : + * - Decompression buffer size is _at least_ LZ4_decoderRingBufferSize(maxBlockSize). + * maxBlockSize is the maximum size of any single block. It can have any value > 16 bytes. + * In which case, encoding and decoding buffers do not need to be synchronized. + * Actually, data can be produced by any source compliant with LZ4 format specification, and respecting maxBlockSize. + * - Synchronized mode : + * Decompression buffer size is _exactly_ the same as compression buffer size, + * and follows exactly same update rule (block boundaries at same positions), + * and decoding function is provided with exact decompressed size of each block (exception for last block of the stream), + * _then_ decoding & encoding ring buffer can have any size, including small ones ( < 64 KB). + * - Decompression buffer is larger than encoding buffer, by a minimum of maxBlockSize more bytes. 
+ * In which case, encoding and decoding buffers do not need to be synchronized, + * and encoding ring buffer can have any size, including small ones ( < 64 KB). + * + * Whenever these conditions are not possible, + * save the last 64KB of decoded data into a safe buffer where it can't be modified during decompression, + * then indicate where this data is saved using LZ4_setStreamDecode(), before decompressing next block. +*/ +LZ4LIB_API int LZ4_decompress_safe_continue (LZ4_streamDecode_t* LZ4_streamDecode, const char* src, char* dst, int srcSize, int dstCapacity); + + +/*! LZ4_decompress_*_usingDict() : + * These decoding functions work the same as + * a combination of LZ4_setStreamDecode() followed by LZ4_decompress_*_continue() + * They are stand-alone, and don't need an LZ4_streamDecode_t structure. + * Dictionary is presumed stable : it must remain accessible and unmodified during decompression. + * Performance tip : Decompression speed can be substantially increased + * when dst == dictStart + dictSize. + */ +LZ4LIB_API int LZ4_decompress_safe_usingDict (const char* src, char* dst, int srcSize, int dstCapcity, const char* dictStart, int dictSize); + + +/*^************************************* + * !!!!!! STATIC LINKING ONLY !!!!!! + ***************************************/ + +/*-**************************************************************************** + * Experimental section + * + * Symbols declared in this section must be considered unstable. Their + * signatures or semantics may change, or they may be removed altogether in the + * future. They are therefore only safe to depend on when the caller is + * statically linked against the library. + * + * To protect against unsafe usage, not only are the declarations guarded, + * the definitions are hidden by default + * when building LZ4 as a shared/dynamic library. + * + * In order to access these declarations, + * define LZ4_STATIC_LINKING_ONLY in your application + * before including LZ4's headers. + * + * In order to make their implementations accessible dynamically, you must + * define LZ4_PUBLISH_STATIC_FUNCTIONS when building the LZ4 library. + ******************************************************************************/ + +#ifdef LZ4_PUBLISH_STATIC_FUNCTIONS +#define LZ4LIB_STATIC_API LZ4LIB_API +#else +#define LZ4LIB_STATIC_API +#endif + +#ifdef LZ4_STATIC_LINKING_ONLY + + +/*! LZ4_compress_fast_extState_fastReset() : + * A variant of LZ4_compress_fast_extState(). + * + * Using this variant avoids an expensive initialization step. + * It is only safe to call if the state buffer is known to be correctly initialized already + * (see above comment on LZ4_resetStream_fast() for a definition of "correctly initialized"). + * From a high level, the difference is that + * this function initializes the provided state with a call to something like LZ4_resetStream_fast() + * while LZ4_compress_fast_extState() starts with a call to LZ4_resetStream(). + */ +LZ4LIB_STATIC_API int LZ4_compress_fast_extState_fastReset (void* state, const char* src, char* dst, int srcSize, int dstCapacity, int acceleration); + +/*! LZ4_attach_dictionary() : + * This is an experimental API that allows + * efficient use of a static dictionary many times. 
+ * + * Rather than re-loading the dictionary buffer into a working context before + * each compression, or copying a pre-loaded dictionary's LZ4_stream_t into a + * working LZ4_stream_t, this function introduces a no-copy setup mechanism, + * in which the working stream references the dictionary stream in-place. + * + * Several assumptions are made about the state of the dictionary stream. + * Currently, only streams which have been prepared by LZ4_loadDict() should + * be expected to work. + * + * Alternatively, the provided dictionaryStream may be NULL, + * in which case any existing dictionary stream is unset. + * + * If a dictionary is provided, it replaces any pre-existing stream history. + * The dictionary contents are the only history that can be referenced and + * logically immediately precede the data compressed in the first subsequent + * compression call. + * + * The dictionary will only remain attached to the working stream through the + * first compression call, at the end of which it is cleared. The dictionary + * stream (and source buffer) must remain in-place / accessible / unchanged + * through the completion of the first compression call on the stream. + */ +LZ4LIB_STATIC_API void LZ4_attach_dictionary(LZ4_stream_t* workingStream, const LZ4_stream_t* dictionaryStream); + +#endif + + +/*-************************************************************ + * PRIVATE DEFINITIONS + ************************************************************** + * Do not use these definitions directly. + * They are only exposed to allow static allocation of `LZ4_stream_t` and `LZ4_streamDecode_t`. + * Accessing members will expose code to API and/or ABI break in future versions of the library. + **************************************************************/ +#define LZ4_HASHLOG (LZ4_MEMORY_USAGE-2) +#define LZ4_HASHTABLESIZE (1 << LZ4_MEMORY_USAGE) +#define LZ4_HASH_SIZE_U32 (1 << LZ4_HASHLOG) /* required as macro for static allocation */ + +#if defined(__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) +#include <stdint.h> + +typedef struct LZ4_stream_t_internal LZ4_stream_t_internal; +struct LZ4_stream_t_internal { + uint32_t hashTable[LZ4_HASH_SIZE_U32]; + uint32_t currentOffset; + uint16_t dirty; + uint16_t tableType; + const uint8_t* dictionary; + const LZ4_stream_t_internal* dictCtx; + uint32_t dictSize; +}; + +typedef struct { + const uint8_t* externalDict; + size_t extDictSize; + const uint8_t* prefixEnd; + size_t prefixSize; +} LZ4_streamDecode_t_internal; + +#else + +typedef struct LZ4_stream_t_internal LZ4_stream_t_internal; +struct LZ4_stream_t_internal { + unsigned int hashTable[LZ4_HASH_SIZE_U32]; + unsigned int currentOffset; + unsigned short dirty; + unsigned short tableType; + const unsigned char* dictionary; + const LZ4_stream_t_internal* dictCtx; + unsigned int dictSize; +}; + +typedef struct { + const unsigned char* externalDict; + const unsigned char* prefixEnd; + size_t extDictSize; + size_t prefixSize; +} LZ4_streamDecode_t_internal; + +#endif + +/*! LZ4_stream_t : + * information structure to track an LZ4 stream. + * LZ4_stream_t can also be created using LZ4_createStream(), which is recommended. + * The structure definition can be convenient for static allocation + * (on stack, or as part of larger structure). + * Init this structure with LZ4_initStream() before first use. + * note : only use this definition in association with static linking ! + * this definition is not API/ABI safe, and may change in a future version. 
+ */ +#define LZ4_STREAMSIZE_U64 ((1 << (LZ4_MEMORY_USAGE-3)) + 4 + ((sizeof(void*)==16) ? 4 : 0) /*AS-400*/ ) +#define LZ4_STREAMSIZE (LZ4_STREAMSIZE_U64 * sizeof(unsigned long long)) +union LZ4_stream_u { + unsigned long long table[LZ4_STREAMSIZE_U64]; + LZ4_stream_t_internal internal_donotuse; +} ; /* previously typedef'd to LZ4_stream_t */ + +/*! LZ4_initStream() : v1.9.0+ + * An LZ4_stream_t structure must be initialized at least once. + * This is automatically done when invoking LZ4_createStream(), + * but it's not when the structure is simply declared on stack (for example). + * + * Use LZ4_initStream() to properly initialize a newly declared LZ4_stream_t. + * It can also initialize any arbitrary buffer of sufficient size, + * and will @return a pointer of proper type upon initialization. + * + * Note : initialization fails if size and alignment conditions are not respected. + * In which case, the function will @return NULL. + * Note2: An LZ4_stream_t structure guarantees correct alignment and size. + * Note3: Before v1.9.0, use LZ4_resetStream() instead + */ +LZ4LIB_API LZ4_stream_t* LZ4_initStream (void* buffer, size_t size); + + +/*! LZ4_streamDecode_t : + * information structure to track an LZ4 stream during decompression. + * init this structure using LZ4_setStreamDecode() before first use. + * note : only use in association with static linking ! + * this definition is not API/ABI safe, + * and may change in a future version ! + */ +#define LZ4_STREAMDECODESIZE_U64 (4 + ((sizeof(void*)==16) ? 2 : 0) /*AS-400*/ ) +#define LZ4_STREAMDECODESIZE (LZ4_STREAMDECODESIZE_U64 * sizeof(unsigned long long)) +union LZ4_streamDecode_u { + unsigned long long table[LZ4_STREAMDECODESIZE_U64]; + LZ4_streamDecode_t_internal internal_donotuse; +} ; /* previously typedef'd to LZ4_streamDecode_t */ + + +/*-************************************ +* Obsolete Functions +**************************************/ + +/*! Deprecation warnings + * + * Deprecated functions make the compiler generate a warning when invoked. + * This is meant to invite users to update their source code. + * Should deprecation warnings be a problem, it is generally possible to disable them, + * typically with -Wno-deprecated-declarations for gcc + * or _CRT_SECURE_NO_WARNINGS in Visual. + * + * Another method is to define LZ4_DISABLE_DEPRECATE_WARNINGS + * before including the header file. 
+ */ +#ifdef LZ4_DISABLE_DEPRECATE_WARNINGS +# define LZ4_DEPRECATED(message) /* disable deprecation warnings */ +#else +# define LZ4_GCC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__) +# if defined (__cplusplus) && (__cplusplus >= 201402) /* C++14 or greater */ +# define LZ4_DEPRECATED(message) [[deprecated(message)]] +# elif (LZ4_GCC_VERSION >= 405) || defined(__clang__) +# define LZ4_DEPRECATED(message) __attribute__((deprecated(message))) +# elif (LZ4_GCC_VERSION >= 301) +# define LZ4_DEPRECATED(message) __attribute__((deprecated)) +# elif defined(_MSC_VER) +# define LZ4_DEPRECATED(message) __declspec(deprecated(message)) +# else +# pragma message("WARNING: You need to implement LZ4_DEPRECATED for this compiler") +# define LZ4_DEPRECATED(message) +# endif +#endif /* LZ4_DISABLE_DEPRECATE_WARNINGS */ + +/* Obsolete compression functions */ +LZ4_DEPRECATED("use LZ4_compress_default() instead") LZ4LIB_API int LZ4_compress (const char* source, char* dest, int sourceSize); +LZ4_DEPRECATED("use LZ4_compress_default() instead") LZ4LIB_API int LZ4_compress_limitedOutput (const char* source, char* dest, int sourceSize, int maxOutputSize); +LZ4_DEPRECATED("use LZ4_compress_fast_extState() instead") LZ4LIB_API int LZ4_compress_withState (void* state, const char* source, char* dest, int inputSize); +LZ4_DEPRECATED("use LZ4_compress_fast_extState() instead") LZ4LIB_API int LZ4_compress_limitedOutput_withState (void* state, const char* source, char* dest, int inputSize, int maxOutputSize); +LZ4_DEPRECATED("use LZ4_compress_fast_continue() instead") LZ4LIB_API int LZ4_compress_continue (LZ4_stream_t* LZ4_streamPtr, const char* source, char* dest, int inputSize); +LZ4_DEPRECATED("use LZ4_compress_fast_continue() instead") LZ4LIB_API int LZ4_compress_limitedOutput_continue (LZ4_stream_t* LZ4_streamPtr, const char* source, char* dest, int inputSize, int maxOutputSize); + +/* Obsolete decompression functions */ +LZ4_DEPRECATED("use LZ4_decompress_fast() instead") LZ4LIB_API int LZ4_uncompress (const char* source, char* dest, int outputSize); +LZ4_DEPRECATED("use LZ4_decompress_safe() instead") LZ4LIB_API int LZ4_uncompress_unknownOutputSize (const char* source, char* dest, int isize, int maxOutputSize); + +/* Obsolete streaming functions; degraded functionality; do not use! + * + * In order to perform streaming compression, these functions depended on data + * that is no longer tracked in the state. They have been preserved as well as + * possible: using them will still produce a correct output. However, they don't + * actually retain any history between compression calls. The compression ratio + * achieved will therefore be no better than compressing each chunk + * independently. + */ +LZ4_DEPRECATED("Use LZ4_createStream() instead") LZ4LIB_API void* LZ4_create (char* inputBuffer); +LZ4_DEPRECATED("Use LZ4_createStream() instead") LZ4LIB_API int LZ4_sizeofStreamState(void); +LZ4_DEPRECATED("Use LZ4_resetStream() instead") LZ4LIB_API int LZ4_resetStreamState(void* state, char* inputBuffer); +LZ4_DEPRECATED("Use LZ4_saveDict() instead") LZ4LIB_API char* LZ4_slideInputBuffer (void* state); + +/* Obsolete streaming decoding functions */ +LZ4_DEPRECATED("use LZ4_decompress_safe_usingDict() instead") LZ4LIB_API int LZ4_decompress_safe_withPrefix64k (const char* src, char* dst, int compressedSize, int maxDstSize); +LZ4_DEPRECATED("use LZ4_decompress_fast_usingDict() instead") LZ4LIB_API int LZ4_decompress_fast_withPrefix64k (const char* src, char* dst, int originalSize); + +/*! 
LZ4_decompress_fast() : **unsafe!** + * These functions used to be faster than LZ4_decompress_safe(), + * but it has changed, and they are now slower than LZ4_decompress_safe(). + * This is because LZ4_decompress_fast() doesn't know the input size, + * and therefore must progress more cautiously in the input buffer to not read beyond the end of block. + * On top of that `LZ4_decompress_fast()` is not protected vs malformed or malicious inputs, making it a security liability. + * As a consequence, LZ4_decompress_fast() is strongly discouraged, and deprecated. + * + * The last remaining LZ4_decompress_fast() specificity is that + * it can decompress a block without knowing its compressed size. + * Such functionality could be achieved in a more secure manner, + * by also providing the maximum size of input buffer, + * but it would require new prototypes, and adaptation of the implementation to this new use case. + * + * Parameters: + * originalSize : is the uncompressed size to regenerate. + * `dst` must be already allocated, its size must be >= 'originalSize' bytes. + * @return : number of bytes read from source buffer (== compressed size). + * The function expects to finish at block's end exactly. + * If the source stream is detected malformed, the function stops decoding and returns a negative result. + * note : LZ4_decompress_fast*() requires originalSize. Thanks to this information, it never writes past the output buffer. + * However, since it doesn't know its 'src' size, it may read an unknown amount of input, past input buffer bounds. + * Also, since match offsets are not validated, match reads from 'src' may underflow too. + * These issues never happen if input (compressed) data is correct. + * But they may happen if input data is invalid (error or intentional tampering). + * As a consequence, use these functions in trusted environments with trusted data **only**. + */ + +LZ4_DEPRECATED("This function is deprecated and unsafe. Consider using LZ4_decompress_safe() instead") +LZ4LIB_API int LZ4_decompress_fast (const char* src, char* dst, int originalSize); +LZ4_DEPRECATED("This function is deprecated and unsafe. Consider using LZ4_decompress_safe_continue() instead") +LZ4LIB_API int LZ4_decompress_fast_continue (LZ4_streamDecode_t* LZ4_streamDecode, const char* src, char* dst, int originalSize); +LZ4_DEPRECATED("This function is deprecated and unsafe. Consider using LZ4_decompress_safe_usingDict() instead") +LZ4LIB_API int LZ4_decompress_fast_usingDict (const char* src, char* dst, int originalSize, const char* dictStart, int dictSize); + +/*! LZ4_resetStream() : + * An LZ4_stream_t structure must be initialized at least once. + * This is done with LZ4_initStream(), or LZ4_resetStream(). + * Consider switching to LZ4_initStream(), + * invoking LZ4_resetStream() will trigger deprecation warnings in the future. + */ +LZ4LIB_API void LZ4_resetStream (LZ4_stream_t* streamPtr); + +} + +#endif /* LZ4_H_2983827168210 */ diff --git a/libs/tracy/common/tracy_sema.h b/libs/tracy/common/tracy_sema.h @@ -0,0 +1,255 @@ +// Copyright (c) 2015 Jeff Preshing +// +// This software is provided 'as-is', without any express or implied +// warranty. In no event will the authors be held liable for any damages +// arising from the use of this software. +// +// Permission is granted to anyone to use this software for any purpose, +// including commercial applications, and to alter it and redistribute it +// freely, subject to the following restrictions: +// +// 1. 
The origin of this software must not be misrepresented; you must not +// claim that you wrote the original software. If you use this software +// in a product, an acknowledgement in the product documentation would be +// appreciated but is not required. +// 2. Altered source versions must be plainly marked as such, and must not be +// misrepresented as being the original software. +// 3. This notice may not be removed or altered from any source distribution. + +#ifndef __TRACY_CPP11OM_SEMAPHORE_H__ +#define __TRACY_CPP11OM_SEMAPHORE_H__ + +#include <atomic> +#include <cassert> + +#if defined(__MACH__) + #include <mach/mach.h> +#elif defined(__unix__) + #include <semaphore.h> +#endif + +namespace tracy +{ + +#if defined(_WIN32) +//--------------------------------------------------------- +// Semaphore (Windows) +//--------------------------------------------------------- +#ifndef MAXLONG +enum { MAXLONG = 0x7fffffff }; +#endif + +#ifndef INFINITE +enum { INFINITE = 0xFFFFFFFF }; +#endif + +#ifndef _WINDOWS_ +typedef void* HANDLE; + +extern "C" __declspec(dllimport) HANDLE __stdcall CreateSemaphoreA( void*, long, long, const char* ); +extern "C" __declspec(dllimport) int __stdcall CloseHandle( HANDLE ); +extern "C" __declspec(dllimport) unsigned long __stdcall WaitForSingleObject( HANDLE, unsigned long ); +extern "C" __declspec(dllimport) int __stdcall ReleaseSemaphore( HANDLE, long, long* ); +#endif + +class Semaphore +{ +private: + HANDLE m_hSema; + + Semaphore(const Semaphore& other) = delete; + Semaphore& operator=(const Semaphore& other) = delete; + +public: + Semaphore(int initialCount = 0) + { + assert(initialCount >= 0); + m_hSema = CreateSemaphoreA(NULL, initialCount, MAXLONG, NULL); + } + + ~Semaphore() + { + CloseHandle(m_hSema); + } + + void wait() + { + WaitForSingleObject(m_hSema, INFINITE); + } + + void signal(int count = 1) + { + ReleaseSemaphore(m_hSema, count, NULL); + } +}; + + +#elif defined(__MACH__) +//--------------------------------------------------------- +// Semaphore (Apple iOS and OSX) +// Can't use POSIX semaphores due to http://lists.apple.com/archives/darwin-kernel/2009/Apr/msg00010.html +//--------------------------------------------------------- + +class Semaphore +{ +private: + semaphore_t m_sema; + + Semaphore(const Semaphore& other) = delete; + Semaphore& operator=(const Semaphore& other) = delete; + +public: + Semaphore(int initialCount = 0) + { + assert(initialCount >= 0); + semaphore_create(mach_task_self(), &m_sema, SYNC_POLICY_FIFO, initialCount); + } + + ~Semaphore() + { + semaphore_destroy(mach_task_self(), m_sema); + } + + void wait() + { + semaphore_wait(m_sema); + } + + void signal() + { + semaphore_signal(m_sema); + } + + void signal(int count) + { + while (count-- > 0) + { + semaphore_signal(m_sema); + } + } +}; + + +#elif defined(__unix__) +//--------------------------------------------------------- +// Semaphore (POSIX, Linux) +//--------------------------------------------------------- + +class Semaphore +{ +private: + sem_t m_sema; + + Semaphore(const Semaphore& other) = delete; + Semaphore& operator=(const Semaphore& other) = delete; + +public: + Semaphore(int initialCount = 0) + { + assert(initialCount >= 0); + sem_init(&m_sema, 0, initialCount); + } + + ~Semaphore() + { + sem_destroy(&m_sema); + } + + void wait() + { + // http://stackoverflow.com/questions/2013181/gdb-causes-sem-wait-to-fail-with-eintr-error + int rc; + do + { + rc = sem_wait(&m_sema); + } + while (rc == -1 && errno == EINTR); + } + + void signal() + { + 
sem_post(&m_sema); + } + + void signal(int count) + { + while (count-- > 0) + { + sem_post(&m_sema); + } + } +}; + + +#else + +#error Unsupported platform! + +#endif + + +//--------------------------------------------------------- +// LightweightSemaphore +//--------------------------------------------------------- +class LightweightSemaphore +{ +private: + std::atomic<int> m_count; + Semaphore m_sema; + + void waitWithPartialSpinning() + { + int oldCount; + // Is there a better way to set the initial spin count? + // If we lower it to 1000, testBenaphore becomes 15x slower on my Core i7-5930K Windows PC, + // as threads start hitting the kernel semaphore. + int spin = 10000; + while (spin--) + { + oldCount = m_count.load(std::memory_order_relaxed); + if ((oldCount > 0) && m_count.compare_exchange_strong(oldCount, oldCount - 1, std::memory_order_acquire)) + return; + std::atomic_signal_fence(std::memory_order_acquire); // Prevent the compiler from collapsing the loop. + } + oldCount = m_count.fetch_sub(1, std::memory_order_acquire); + if (oldCount <= 0) + { + m_sema.wait(); + } + } + +public: + LightweightSemaphore(int initialCount = 0) : m_count(initialCount) + { + assert(initialCount >= 0); + } + + bool tryWait() + { + int oldCount = m_count.load(std::memory_order_relaxed); + return (oldCount > 0 && m_count.compare_exchange_strong(oldCount, oldCount - 1, std::memory_order_acquire)); + } + + void wait() + { + if (!tryWait()) + waitWithPartialSpinning(); + } + + void signal(int count = 1) + { + int oldCount = m_count.fetch_add(count, std::memory_order_release); + int toRelease = -oldCount < count ? -oldCount : count; + if (toRelease > 0) + { + m_sema.signal(toRelease); + } + } +}; + + +typedef LightweightSemaphore DefaultSemaphoreType; + +} + +#endif // __CPP11OM_SEMAPHORE_H__ diff --git a/make.lua b/make.lua @@ -1,5 +1,7 @@ require( "ggbuild.gen_ninja" ) +require( "libs.tracy" ) + obj_cxxflags( ".*", "-I source -I libs" ) msvc_obj_cxxflags( ".*", "/W4 /wd4100 /wd4146 /wd4189 /wd4201 /wd4307 /wd4324 /wd4351 /wd4127 /wd4505 /wd4530 /wd4702 /wd4706 /D_CRT_SECURE_NO_WARNINGS" ) @@ -11,6 +13,10 @@ gcc_obj_cxxflags( ".*", "-Wall -Wextra -Wcast-align -Wvla -Wformat-security" ) - gcc_obj_cxxflags( ".*", "-Wno-unused-parameter -Wno-missing-field-initializers -Wno-implicit-fallthrough -Wno-format-truncation" ) gcc_obj_cxxflags( ".*", "-Werror=vla -Werror=format-security -Werror=unused-value" ) +if config ~= "release" then + obj_cxxflags( ".*", "-DTRACY_ENABLE" ) +end + local platform_srcs, platform_libs if OS == "windows" then @@ -31,7 +37,7 @@ bin( "mudgangster", { "src/ui.cc", "src/script.cc", "src/textbox.cc", "src/input.cc", "src/platform_network.cc", }, - libs = platform_libs, + libs = { platform_libs, "tracy" }, rc = "src/rc", diff --git a/src/common.h b/src/common.h @@ -9,6 +9,8 @@ #include "config.h" +#include "tracy/Tracy.hpp" + typedef uint8_t u8; typedef uint16_t u16; typedef uint32_t u32;
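
Editor's note on the block API vendored above: the one-shot functions are the simplest entry point. The sketch below is illustrative only and is not part of this commit; it assumes the vendored header is reachable as "tracy/common/tracy_lz4.hpp" (make.lua already passes -I libs) and that the matching implementation file, whose tail is visible at the top of this diff, is compiled and linked. In this copy every declaration lives in namespace tracy.

#include <stdio.h>
#include <stdlib.h>

#include "tracy/common/tracy_lz4.hpp"

int main()
{
    const char src[] = "an example payload, an example payload, an example payload";
    const int srcSize = (int)sizeof( src );

    // Worst-case output size; compression into a buffer this large cannot fail.
    const int bound = tracy::LZ4_compressBound( srcSize );
    char* compressed = (char*)malloc( bound );
    const int compressedSize = tracy::LZ4_compress_default( src, compressed, srcSize, bound );
    if( compressedSize <= 0 ) return 1;          // 0 means the data did not fit in 'dst'

    // Decompression needs the exact compressed size and a large-enough destination.
    char* roundtrip = (char*)malloc( srcSize );
    const int decompressedSize = tracy::LZ4_decompress_safe( compressed, roundtrip, compressedSize, srcSize );
    if( decompressedSize < 0 ) return 1;          // negative return means malformed input

    printf( "%d -> %d -> %d bytes\n", srcSize, compressedSize, decompressedSize );
    free( compressed );
    free( roundtrip );
    return 0;
}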
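
The streaming notes above (LZ4_compress_fast_continue(), Notes 1-5) condense into the following sketch. Again not part of the commit: the chunk size and error policy are placeholders; the load-bearing points are that each call emits one block, each block's compressed size must be shipped as out-of-band metadata, and the previously compressed input must stay readable at the same address.

#include <stdio.h>

#include "tracy/common/tracy_lz4.hpp"

// Illustrative only: compress a buffer as a chain of dependent blocks.
// The consumer must decompress them in order (e.g. with
// LZ4_decompress_safe_continue()), one call per block.
void CompressInChunks( const char* data, int totalSize )
{
    const int kChunk = 16 * 1024;                 // any size <= 64 KB behaves the same way
    char dst[ LZ4_COMPRESSBOUND( 16 * 1024 ) ];   // worst case for one chunk

    tracy::LZ4_stream_t* stream = tracy::LZ4_createStream();
    for( int offset = 0; offset < totalSize; offset += kChunk )
    {
        const int srcSize = totalSize - offset < kChunk ? totalSize - offset : kChunk;
        // The slice lives inside 'data', so the previous 64 KB of input stays
        // valid at the same address, as Note 2 above requires.
        const int written = tracy::LZ4_compress_fast_continue( stream, data + offset, dst, srcSize, (int)sizeof( dst ), 1 );
        if( written <= 0 ) break;                 // after an error the stream may only be reset or freed
        printf( "block: %d -> %d bytes\n", srcSize, written );
    }
    tracy::LZ4_freeStream( stream );
}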
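
Finally, the point of the commit: with make.lua defining TRACY_ENABLE for non-release configs and src/common.h now including tracy/Tracy.hpp, Tracy's zone macros are available throughout the client; in release builds they expand to nothing. The snippet below is a hypothetical illustration, not code from this diff — the function and its body are invented, only the ZoneScoped/ZoneText macros come from the vendored Tracy.hpp.

#include <stddef.h>

#include "common.h"   // now pulls in tracy/Tracy.hpp (see the src/common.h hunk above)

// Hypothetical instrumentation target; mudgangster's real functions are not part of this diff.
static void processServerLine( const char* line, size_t len )
{
    ZoneScoped;              // opens a zone named after the enclosing function, closed on scope exit
    ZoneText( line, len );   // attaches the text being processed to the zone in the profiler UI
    // ... run triggers, apply highlighting, append to the output textbox ...
}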