mudgangster

Tiny, scriptable MUD client

TracyDxt1.cpp (36744B)


#include "TracyDxt1.hpp"
#include "../common/TracyForceInline.hpp"

#include <assert.h>
#include <stdint.h>
#include <string.h>

#ifdef __ARM_NEON
#  include <arm_neon.h>
#endif

#if defined __AVX__ && !defined __SSE4_1__
#  define __SSE4_1__
#endif

#if defined __SSE4_1__ || defined __AVX2__
#  ifdef _MSC_VER
#    include <intrin.h>
#  else
#    include <x86intrin.h>
#    ifdef __CYGWIN__
#      ifndef _mm256_cvtsi256_si32
#        define _mm256_cvtsi256_si32( v ) ( _mm_cvtsi128_si32( _mm256_castsi256_si128( v ) ) )
#      endif
#    endif
#  endif
#endif

namespace tracy
{

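// Convert 8-bit RGB components to a packed 16-bit 5:6:5 color.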
static inline uint16_t to565( uint8_t r, uint8_t g, uint8_t b )
{
    return ( ( r & 0xF8 ) << 8 ) | ( ( g & 0xFC ) << 3 ) | ( b >> 3 );
}

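// The same conversion for a pixel read as a little-endian 32-bit word (0xXXBBGGRR).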
static inline uint16_t to565( uint32_t c )
{
    return
        ( ( c & 0xF80000 ) >> 19 ) |
        ( ( c & 0x00FC00 ) >> 5 ) |
        ( ( c & 0x0000F8 ) << 8 );
}

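// Fixed-point reciprocal table, DivTable[r] ~= (4<<16)/(r+1), indexed by a block's
// total RGB range (at most 255*3). Multiplying a pixel's summed distance from the
// block minimum by it and keeping the high 16 bits yields a 2-bit selector.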
static const uint16_t DivTable[255*3+1] = {
    0xffff, 0xffff, 0xffff, 0xffff, 0xcccc, 0xaaaa, 0x9249, 0x8000, 0x71c7, 0x6666, 0x5d17, 0x5555, 0x4ec4, 0x4924, 0x4444, 0x4000,
    0x3c3c, 0x38e3, 0x35e5, 0x3333, 0x30c3, 0x2e8b, 0x2c85, 0x2aaa, 0x28f5, 0x2762, 0x25ed, 0x2492, 0x234f, 0x2222, 0x2108, 0x2000,
    0x1f07, 0x1e1e, 0x1d41, 0x1c71, 0x1bac, 0x1af2, 0x1a41, 0x1999, 0x18f9, 0x1861, 0x17d0, 0x1745, 0x16c1, 0x1642, 0x15c9, 0x1555,
    0x14e5, 0x147a, 0x1414, 0x13b1, 0x1352, 0x12f6, 0x129e, 0x1249, 0x11f7, 0x11a7, 0x115b, 0x1111, 0x10c9, 0x1084, 0x1041, 0x1000,
    0x0fc0, 0x0f83, 0x0f48, 0x0f0f, 0x0ed7, 0x0ea0, 0x0e6c, 0x0e38, 0x0e07, 0x0dd6, 0x0da7, 0x0d79, 0x0d4c, 0x0d20, 0x0cf6, 0x0ccc,
    0x0ca4, 0x0c7c, 0x0c56, 0x0c30, 0x0c0c, 0x0be8, 0x0bc5, 0x0ba2, 0x0b81, 0x0b60, 0x0b40, 0x0b21, 0x0b02, 0x0ae4, 0x0ac7, 0x0aaa,
    0x0a8e, 0x0a72, 0x0a57, 0x0a3d, 0x0a23, 0x0a0a, 0x09f1, 0x09d8, 0x09c0, 0x09a9, 0x0991, 0x097b, 0x0964, 0x094f, 0x0939, 0x0924,
    0x090f, 0x08fb, 0x08e7, 0x08d3, 0x08c0, 0x08ad, 0x089a, 0x0888, 0x0876, 0x0864, 0x0853, 0x0842, 0x0831, 0x0820, 0x0810, 0x0800,
    0x07f0, 0x07e0, 0x07d1, 0x07c1, 0x07b3, 0x07a4, 0x0795, 0x0787, 0x0779, 0x076b, 0x075d, 0x0750, 0x0743, 0x0736, 0x0729, 0x071c,
    0x070f, 0x0703, 0x06f7, 0x06eb, 0x06df, 0x06d3, 0x06c8, 0x06bc, 0x06b1, 0x06a6, 0x069b, 0x0690, 0x0685, 0x067b, 0x0670, 0x0666,
    0x065c, 0x0652, 0x0648, 0x063e, 0x0634, 0x062b, 0x0621, 0x0618, 0x060f, 0x0606, 0x05fd, 0x05f4, 0x05eb, 0x05e2, 0x05d9, 0x05d1,
    0x05c9, 0x05c0, 0x05b8, 0x05b0, 0x05a8, 0x05a0, 0x0598, 0x0590, 0x0588, 0x0581, 0x0579, 0x0572, 0x056b, 0x0563, 0x055c, 0x0555,
    0x054e, 0x0547, 0x0540, 0x0539, 0x0532, 0x052b, 0x0525, 0x051e, 0x0518, 0x0511, 0x050b, 0x0505, 0x04fe, 0x04f8, 0x04f2, 0x04ec,
    0x04e6, 0x04e0, 0x04da, 0x04d4, 0x04ce, 0x04c8, 0x04c3, 0x04bd, 0x04b8, 0x04b2, 0x04ad, 0x04a7, 0x04a2, 0x049c, 0x0497, 0x0492,
    0x048d, 0x0487, 0x0482, 0x047d, 0x0478, 0x0473, 0x046e, 0x0469, 0x0465, 0x0460, 0x045b, 0x0456, 0x0452, 0x044d, 0x0448, 0x0444,
    0x043f, 0x043b, 0x0436, 0x0432, 0x042d, 0x0429, 0x0425, 0x0421, 0x041c, 0x0418, 0x0414, 0x0410, 0x040c, 0x0408, 0x0404, 0x0400,
    0x03fc, 0x03f8, 0x03f4, 0x03f0, 0x03ec, 0x03e8, 0x03e4, 0x03e0, 0x03dd, 0x03d9, 0x03d5, 0x03d2, 0x03ce, 0x03ca, 0x03c7, 0x03c3,
    0x03c0, 0x03bc, 0x03b9, 0x03b5, 0x03b2, 0x03ae, 0x03ab, 0x03a8, 0x03a4, 0x03a1, 0x039e, 0x039b, 0x0397, 0x0394, 0x0391, 0x038e,
    0x038b, 0x0387, 0x0384, 0x0381, 0x037e, 0x037b, 0x0378, 0x0375, 0x0372, 0x036f, 0x036c, 0x0369, 0x0366, 0x0364, 0x0361, 0x035e,
    0x035b, 0x0358, 0x0355, 0x0353, 0x0350, 0x034d, 0x034a, 0x0348, 0x0345, 0x0342, 0x0340, 0x033d, 0x033a, 0x0338, 0x0335, 0x0333,
    0x0330, 0x032e, 0x032b, 0x0329, 0x0326, 0x0324, 0x0321, 0x031f, 0x031c, 0x031a, 0x0317, 0x0315, 0x0313, 0x0310, 0x030e, 0x030c,
    0x0309, 0x0307, 0x0305, 0x0303, 0x0300, 0x02fe, 0x02fc, 0x02fa, 0x02f7, 0x02f5, 0x02f3, 0x02f1, 0x02ef, 0x02ec, 0x02ea, 0x02e8,
    0x02e6, 0x02e4, 0x02e2, 0x02e0, 0x02de, 0x02dc, 0x02da, 0x02d8, 0x02d6, 0x02d4, 0x02d2, 0x02d0, 0x02ce, 0x02cc, 0x02ca, 0x02c8,
    0x02c6, 0x02c4, 0x02c2, 0x02c0, 0x02be, 0x02bc, 0x02bb, 0x02b9, 0x02b7, 0x02b5, 0x02b3, 0x02b1, 0x02b0, 0x02ae, 0x02ac, 0x02aa,
    0x02a8, 0x02a7, 0x02a5, 0x02a3, 0x02a1, 0x02a0, 0x029e, 0x029c, 0x029b, 0x0299, 0x0297, 0x0295, 0x0294, 0x0292, 0x0291, 0x028f,
    0x028d, 0x028c, 0x028a, 0x0288, 0x0287, 0x0285, 0x0284, 0x0282, 0x0280, 0x027f, 0x027d, 0x027c, 0x027a, 0x0279, 0x0277, 0x0276,
    0x0274, 0x0273, 0x0271, 0x0270, 0x026e, 0x026d, 0x026b, 0x026a, 0x0268, 0x0267, 0x0265, 0x0264, 0x0263, 0x0261, 0x0260, 0x025e,
    0x025d, 0x025c, 0x025a, 0x0259, 0x0257, 0x0256, 0x0255, 0x0253, 0x0252, 0x0251, 0x024f, 0x024e, 0x024d, 0x024b, 0x024a, 0x0249,
    0x0247, 0x0246, 0x0245, 0x0243, 0x0242, 0x0241, 0x0240, 0x023e, 0x023d, 0x023c, 0x023b, 0x0239, 0x0238, 0x0237, 0x0236, 0x0234,
    0x0233, 0x0232, 0x0231, 0x0230, 0x022e, 0x022d, 0x022c, 0x022b, 0x022a, 0x0229, 0x0227, 0x0226, 0x0225, 0x0224, 0x0223, 0x0222,
    0x0220, 0x021f, 0x021e, 0x021d, 0x021c, 0x021b, 0x021a, 0x0219, 0x0218, 0x0216, 0x0215, 0x0214, 0x0213, 0x0212, 0x0211, 0x0210,
    0x020f, 0x020e, 0x020d, 0x020c, 0x020b, 0x020a, 0x0209, 0x0208, 0x0207, 0x0206, 0x0205, 0x0204, 0x0203, 0x0202, 0x0201, 0x0200,
    0x01ff, 0x01fe, 0x01fd, 0x01fc, 0x01fb, 0x01fa, 0x01f9, 0x01f8, 0x01f7, 0x01f6, 0x01f5, 0x01f4, 0x01f3, 0x01f2, 0x01f1, 0x01f0,
    0x01ef, 0x01ee, 0x01ed, 0x01ec, 0x01eb, 0x01ea, 0x01e9, 0x01e9, 0x01e8, 0x01e7, 0x01e6, 0x01e5, 0x01e4, 0x01e3, 0x01e2, 0x01e1,
    0x01e0, 0x01e0, 0x01df, 0x01de, 0x01dd, 0x01dc, 0x01db, 0x01da, 0x01da, 0x01d9, 0x01d8, 0x01d7, 0x01d6, 0x01d5, 0x01d4, 0x01d4,
    0x01d3, 0x01d2, 0x01d1, 0x01d0, 0x01cf, 0x01cf, 0x01ce, 0x01cd, 0x01cc, 0x01cb, 0x01cb, 0x01ca, 0x01c9, 0x01c8, 0x01c7, 0x01c7,
    0x01c6, 0x01c5, 0x01c4, 0x01c3, 0x01c3, 0x01c2, 0x01c1, 0x01c0, 0x01c0, 0x01bf, 0x01be, 0x01bd, 0x01bd, 0x01bc, 0x01bb, 0x01ba,
    0x01ba, 0x01b9, 0x01b8, 0x01b7, 0x01b7, 0x01b6, 0x01b5, 0x01b4, 0x01b4, 0x01b3, 0x01b2, 0x01b2, 0x01b1, 0x01b0, 0x01af, 0x01af,
    0x01ae, 0x01ad, 0x01ad, 0x01ac, 0x01ab, 0x01aa, 0x01aa, 0x01a9, 0x01a8, 0x01a8, 0x01a7, 0x01a6, 0x01a6, 0x01a5, 0x01a4, 0x01a4,
    0x01a3, 0x01a2, 0x01a2, 0x01a1, 0x01a0, 0x01a0, 0x019f, 0x019e, 0x019e, 0x019d, 0x019c, 0x019c, 0x019b, 0x019a, 0x019a, 0x0199,
    0x0198, 0x0198, 0x0197, 0x0197, 0x0196, 0x0195, 0x0195, 0x0194, 0x0193, 0x0193, 0x0192, 0x0192, 0x0191, 0x0190, 0x0190, 0x018f,
    0x018f, 0x018e, 0x018d, 0x018d, 0x018c, 0x018b, 0x018b, 0x018a, 0x018a, 0x0189, 0x0189, 0x0188, 0x0187, 0x0187, 0x0186, 0x0186,
    0x0185, 0x0184, 0x0184, 0x0183, 0x0183, 0x0182, 0x0182, 0x0181, 0x0180, 0x0180, 0x017f, 0x017f, 0x017e, 0x017e, 0x017d, 0x017d,
    0x017c, 0x017b, 0x017b, 0x017a, 0x017a, 0x0179, 0x0179, 0x0178, 0x0178, 0x0177, 0x0177, 0x0176, 0x0175, 0x0175, 0x0174, 0x0174,
    0x0173, 0x0173, 0x0172, 0x0172, 0x0171, 0x0171, 0x0170, 0x0170, 0x016f, 0x016f, 0x016e, 0x016e, 0x016d, 0x016d, 0x016c, 0x016c,
    0x016b, 0x016b, 0x016a, 0x016a, 0x0169, 0x0169, 0x0168, 0x0168, 0x0167, 0x0167, 0x0166, 0x0166, 0x0165, 0x0165, 0x0164, 0x0164,
    0x0163, 0x0163, 0x0162, 0x0162, 0x0161, 0x0161, 0x0160, 0x0160, 0x015f, 0x015f, 0x015e, 0x015e, 0x015d, 0x015d, 0x015d, 0x015c,
    0x015c, 0x015b, 0x015b, 0x015a, 0x015a, 0x0159, 0x0159, 0x0158, 0x0158, 0x0158, 0x0157, 0x0157, 0x0156, 0x0156
};
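// Table for the AVX2 path; entries for ranges below 17 are zeroed, so nearly
// uniform blocks produce all-zero selectors (that path has no solid-color shortcut).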
static const uint16_t DivTableAVX[255*3+1] = {
    0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
    0x0000, 0x38e3, 0x35e5, 0x3333, 0x30c3, 0x2e8b, 0x2c85, 0x2aaa, 0x28f5, 0x2762, 0x25ed, 0x2492, 0x234f, 0x2222, 0x2108, 0x2000,
    0x1f07, 0x1e1e, 0x1d41, 0x1c71, 0x1bac, 0x1af2, 0x1a41, 0x1999, 0x18f9, 0x1861, 0x17d0, 0x1745, 0x16c1, 0x1642, 0x15c9, 0x1555,
    0x14e5, 0x147a, 0x1414, 0x13b1, 0x1352, 0x12f6, 0x129e, 0x1249, 0x11f7, 0x11a7, 0x115b, 0x1111, 0x10c9, 0x1084, 0x1041, 0x1000,
    0x0fc0, 0x0f83, 0x0f48, 0x0f0f, 0x0ed7, 0x0ea0, 0x0e6c, 0x0e38, 0x0e07, 0x0dd6, 0x0da7, 0x0d79, 0x0d4c, 0x0d20, 0x0cf6, 0x0ccc,
    0x0ca4, 0x0c7c, 0x0c56, 0x0c30, 0x0c0c, 0x0be8, 0x0bc5, 0x0ba2, 0x0b81, 0x0b60, 0x0b40, 0x0b21, 0x0b02, 0x0ae4, 0x0ac7, 0x0aaa,
    0x0a8e, 0x0a72, 0x0a57, 0x0a3d, 0x0a23, 0x0a0a, 0x09f1, 0x09d8, 0x09c0, 0x09a9, 0x0991, 0x097b, 0x0964, 0x094f, 0x0939, 0x0924,
    0x090f, 0x08fb, 0x08e7, 0x08d3, 0x08c0, 0x08ad, 0x089a, 0x0888, 0x0876, 0x0864, 0x0853, 0x0842, 0x0831, 0x0820, 0x0810, 0x0800,
    0x07f0, 0x07e0, 0x07d1, 0x07c1, 0x07b3, 0x07a4, 0x0795, 0x0787, 0x0779, 0x076b, 0x075d, 0x0750, 0x0743, 0x0736, 0x0729, 0x071c,
    0x070f, 0x0703, 0x06f7, 0x06eb, 0x06df, 0x06d3, 0x06c8, 0x06bc, 0x06b1, 0x06a6, 0x069b, 0x0690, 0x0685, 0x067b, 0x0670, 0x0666,
    0x065c, 0x0652, 0x0648, 0x063e, 0x0634, 0x062b, 0x0621, 0x0618, 0x060f, 0x0606, 0x05fd, 0x05f4, 0x05eb, 0x05e2, 0x05d9, 0x05d1,
    0x05c9, 0x05c0, 0x05b8, 0x05b0, 0x05a8, 0x05a0, 0x0598, 0x0590, 0x0588, 0x0581, 0x0579, 0x0572, 0x056b, 0x0563, 0x055c, 0x0555,
    0x054e, 0x0547, 0x0540, 0x0539, 0x0532, 0x052b, 0x0525, 0x051e, 0x0518, 0x0511, 0x050b, 0x0505, 0x04fe, 0x04f8, 0x04f2, 0x04ec,
    0x04e6, 0x04e0, 0x04da, 0x04d4, 0x04ce, 0x04c8, 0x04c3, 0x04bd, 0x04b8, 0x04b2, 0x04ad, 0x04a7, 0x04a2, 0x049c, 0x0497, 0x0492,
    0x048d, 0x0487, 0x0482, 0x047d, 0x0478, 0x0473, 0x046e, 0x0469, 0x0465, 0x0460, 0x045b, 0x0456, 0x0452, 0x044d, 0x0448, 0x0444,
    0x043f, 0x043b, 0x0436, 0x0432, 0x042d, 0x0429, 0x0425, 0x0421, 0x041c, 0x0418, 0x0414, 0x0410, 0x040c, 0x0408, 0x0404, 0x0400,
    0x03fc, 0x03f8, 0x03f4, 0x03f0, 0x03ec, 0x03e8, 0x03e4, 0x03e0, 0x03dd, 0x03d9, 0x03d5, 0x03d2, 0x03ce, 0x03ca, 0x03c7, 0x03c3,
    0x03c0, 0x03bc, 0x03b9, 0x03b5, 0x03b2, 0x03ae, 0x03ab, 0x03a8, 0x03a4, 0x03a1, 0x039e, 0x039b, 0x0397, 0x0394, 0x0391, 0x038e,
    0x038b, 0x0387, 0x0384, 0x0381, 0x037e, 0x037b, 0x0378, 0x0375, 0x0372, 0x036f, 0x036c, 0x0369, 0x0366, 0x0364, 0x0361, 0x035e,
    0x035b, 0x0358, 0x0355, 0x0353, 0x0350, 0x034d, 0x034a, 0x0348, 0x0345, 0x0342, 0x0340, 0x033d, 0x033a, 0x0338, 0x0335, 0x0333,
    0x0330, 0x032e, 0x032b, 0x0329, 0x0326, 0x0324, 0x0321, 0x031f, 0x031c, 0x031a, 0x0317, 0x0315, 0x0313, 0x0310, 0x030e, 0x030c,
    0x0309, 0x0307, 0x0305, 0x0303, 0x0300, 0x02fe, 0x02fc, 0x02fa, 0x02f7, 0x02f5, 0x02f3, 0x02f1, 0x02ef, 0x02ec, 0x02ea, 0x02e8,
    0x02e6, 0x02e4, 0x02e2, 0x02e0, 0x02de, 0x02dc, 0x02da, 0x02d8, 0x02d6, 0x02d4, 0x02d2, 0x02d0, 0x02ce, 0x02cc, 0x02ca, 0x02c8,
    0x02c6, 0x02c4, 0x02c2, 0x02c0, 0x02be, 0x02bc, 0x02bb, 0x02b9, 0x02b7, 0x02b5, 0x02b3, 0x02b1, 0x02b0, 0x02ae, 0x02ac, 0x02aa,
    0x02a8, 0x02a7, 0x02a5, 0x02a3, 0x02a1, 0x02a0, 0x029e, 0x029c, 0x029b, 0x0299, 0x0297, 0x0295, 0x0294, 0x0292, 0x0291, 0x028f,
    0x028d, 0x028c, 0x028a, 0x0288, 0x0287, 0x0285, 0x0284, 0x0282, 0x0280, 0x027f, 0x027d, 0x027c, 0x027a, 0x0279, 0x0277, 0x0276,
    0x0274, 0x0273, 0x0271, 0x0270, 0x026e, 0x026d, 0x026b, 0x026a, 0x0268, 0x0267, 0x0265, 0x0264, 0x0263, 0x0261, 0x0260, 0x025e,
    0x025d, 0x025c, 0x025a, 0x0259, 0x0257, 0x0256, 0x0255, 0x0253, 0x0252, 0x0251, 0x024f, 0x024e, 0x024d, 0x024b, 0x024a, 0x0249,
    0x0247, 0x0246, 0x0245, 0x0243, 0x0242, 0x0241, 0x0240, 0x023e, 0x023d, 0x023c, 0x023b, 0x0239, 0x0238, 0x0237, 0x0236, 0x0234,
    0x0233, 0x0232, 0x0231, 0x0230, 0x022e, 0x022d, 0x022c, 0x022b, 0x022a, 0x0229, 0x0227, 0x0226, 0x0225, 0x0224, 0x0223, 0x0222,
    0x0220, 0x021f, 0x021e, 0x021d, 0x021c, 0x021b, 0x021a, 0x0219, 0x0218, 0x0216, 0x0215, 0x0214, 0x0213, 0x0212, 0x0211, 0x0210,
    0x020f, 0x020e, 0x020d, 0x020c, 0x020b, 0x020a, 0x0209, 0x0208, 0x0207, 0x0206, 0x0205, 0x0204, 0x0203, 0x0202, 0x0201, 0x0200,
    0x01ff, 0x01fe, 0x01fd, 0x01fc, 0x01fb, 0x01fa, 0x01f9, 0x01f8, 0x01f7, 0x01f6, 0x01f5, 0x01f4, 0x01f3, 0x01f2, 0x01f1, 0x01f0,
    0x01ef, 0x01ee, 0x01ed, 0x01ec, 0x01eb, 0x01ea, 0x01e9, 0x01e9, 0x01e8, 0x01e7, 0x01e6, 0x01e5, 0x01e4, 0x01e3, 0x01e2, 0x01e1,
    0x01e0, 0x01e0, 0x01df, 0x01de, 0x01dd, 0x01dc, 0x01db, 0x01da, 0x01da, 0x01d9, 0x01d8, 0x01d7, 0x01d6, 0x01d5, 0x01d4, 0x01d4,
    0x01d3, 0x01d2, 0x01d1, 0x01d0, 0x01cf, 0x01cf, 0x01ce, 0x01cd, 0x01cc, 0x01cb, 0x01cb, 0x01ca, 0x01c9, 0x01c8, 0x01c7, 0x01c7,
    0x01c6, 0x01c5, 0x01c4, 0x01c3, 0x01c3, 0x01c2, 0x01c1, 0x01c0, 0x01c0, 0x01bf, 0x01be, 0x01bd, 0x01bd, 0x01bc, 0x01bb, 0x01ba,
    0x01ba, 0x01b9, 0x01b8, 0x01b7, 0x01b7, 0x01b6, 0x01b5, 0x01b4, 0x01b4, 0x01b3, 0x01b2, 0x01b2, 0x01b1, 0x01b0, 0x01af, 0x01af,
    0x01ae, 0x01ad, 0x01ad, 0x01ac, 0x01ab, 0x01aa, 0x01aa, 0x01a9, 0x01a8, 0x01a8, 0x01a7, 0x01a6, 0x01a6, 0x01a5, 0x01a4, 0x01a4,
    0x01a3, 0x01a2, 0x01a2, 0x01a1, 0x01a0, 0x01a0, 0x019f, 0x019e, 0x019e, 0x019d, 0x019c, 0x019c, 0x019b, 0x019a, 0x019a, 0x0199,
    0x0198, 0x0198, 0x0197, 0x0197, 0x0196, 0x0195, 0x0195, 0x0194, 0x0193, 0x0193, 0x0192, 0x0192, 0x0191, 0x0190, 0x0190, 0x018f,
    0x018f, 0x018e, 0x018d, 0x018d, 0x018c, 0x018b, 0x018b, 0x018a, 0x018a, 0x0189, 0x0189, 0x0188, 0x0187, 0x0187, 0x0186, 0x0186,
    0x0185, 0x0184, 0x0184, 0x0183, 0x0183, 0x0182, 0x0182, 0x0181, 0x0180, 0x0180, 0x017f, 0x017f, 0x017e, 0x017e, 0x017d, 0x017d,
    0x017c, 0x017b, 0x017b, 0x017a, 0x017a, 0x0179, 0x0179, 0x0178, 0x0178, 0x0177, 0x0177, 0x0176, 0x0175, 0x0175, 0x0174, 0x0174,
    0x0173, 0x0173, 0x0172, 0x0172, 0x0171, 0x0171, 0x0170, 0x0170, 0x016f, 0x016f, 0x016e, 0x016e, 0x016d, 0x016d, 0x016c, 0x016c,
    0x016b, 0x016b, 0x016a, 0x016a, 0x0169, 0x0169, 0x0168, 0x0168, 0x0167, 0x0167, 0x0166, 0x0166, 0x0165, 0x0165, 0x0164, 0x0164,
    0x0163, 0x0163, 0x0162, 0x0162, 0x0161, 0x0161, 0x0160, 0x0160, 0x015f, 0x015f, 0x015e, 0x015e, 0x015d, 0x015d, 0x015d, 0x015c,
    0x015c, 0x015b, 0x015b, 0x015a, 0x015a, 0x0159, 0x0159, 0x0158, 0x0158, 0x0158, 0x0157, 0x0157, 0x0156, 0x0156
};
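// Half-scale table for the NEON path, DivTableNEON[r] ~= (2<<16)/(r+1):
// vqdmulhq_s16 doubles the product before taking the high half, and the table compensates.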
static const uint16_t DivTableNEON[255*3+1] = {
    0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
    0x0000, 0x1c71, 0x1af2, 0x1999, 0x1861, 0x1745, 0x1642, 0x1555, 0x147a, 0x13b1, 0x12f6, 0x1249, 0x11a7, 0x1111, 0x1084, 0x1000,
    0x0f83, 0x0f0f, 0x0ea0, 0x0e38, 0x0dd6, 0x0d79, 0x0d20, 0x0ccc, 0x0c7c, 0x0c30, 0x0be8, 0x0ba2, 0x0b60, 0x0b21, 0x0ae4, 0x0aaa,
    0x0a72, 0x0a3d, 0x0a0a, 0x09d8, 0x09a9, 0x097b, 0x094f, 0x0924, 0x08fb, 0x08d3, 0x08ad, 0x0888, 0x0864, 0x0842, 0x0820, 0x0800,
    0x07e0, 0x07c1, 0x07a4, 0x0787, 0x076b, 0x0750, 0x0736, 0x071c, 0x0703, 0x06eb, 0x06d3, 0x06bc, 0x06a6, 0x0690, 0x067b, 0x0666,
    0x0652, 0x063e, 0x062b, 0x0618, 0x0606, 0x05f4, 0x05e2, 0x05d1, 0x05c0, 0x05b0, 0x05a0, 0x0590, 0x0581, 0x0572, 0x0563, 0x0555,
    0x0547, 0x0539, 0x052b, 0x051e, 0x0511, 0x0505, 0x04f8, 0x04ec, 0x04e0, 0x04d4, 0x04c8, 0x04bd, 0x04b2, 0x04a7, 0x049c, 0x0492,
    0x0487, 0x047d, 0x0473, 0x0469, 0x0460, 0x0456, 0x044d, 0x0444, 0x043b, 0x0432, 0x0429, 0x0421, 0x0418, 0x0410, 0x0408, 0x0400,
    0x03f8, 0x03f0, 0x03e8, 0x03e0, 0x03d9, 0x03d2, 0x03ca, 0x03c3, 0x03bc, 0x03b5, 0x03ae, 0x03a8, 0x03a1, 0x039b, 0x0394, 0x038e,
    0x0387, 0x0381, 0x037b, 0x0375, 0x036f, 0x0369, 0x0364, 0x035e, 0x0358, 0x0353, 0x034d, 0x0348, 0x0342, 0x033d, 0x0338, 0x0333,
    0x032e, 0x0329, 0x0324, 0x031f, 0x031a, 0x0315, 0x0310, 0x030c, 0x0307, 0x0303, 0x02fe, 0x02fa, 0x02f5, 0x02f1, 0x02ec, 0x02e8,
    0x02e4, 0x02e0, 0x02dc, 0x02d8, 0x02d4, 0x02d0, 0x02cc, 0x02c8, 0x02c4, 0x02c0, 0x02bc, 0x02b9, 0x02b5, 0x02b1, 0x02ae, 0x02aa,
    0x02a7, 0x02a3, 0x02a0, 0x029c, 0x0299, 0x0295, 0x0292, 0x028f, 0x028c, 0x0288, 0x0285, 0x0282, 0x027f, 0x027c, 0x0279, 0x0276,
    0x0273, 0x0270, 0x026d, 0x026a, 0x0267, 0x0264, 0x0261, 0x025e, 0x025c, 0x0259, 0x0256, 0x0253, 0x0251, 0x024e, 0x024b, 0x0249,
    0x0246, 0x0243, 0x0241, 0x023e, 0x023c, 0x0239, 0x0237, 0x0234, 0x0232, 0x0230, 0x022d, 0x022b, 0x0229, 0x0226, 0x0224, 0x0222,
    0x021f, 0x021d, 0x021b, 0x0219, 0x0216, 0x0214, 0x0212, 0x0210, 0x020e, 0x020c, 0x020a, 0x0208, 0x0206, 0x0204, 0x0202, 0x0200,
    0x01fe, 0x01fc, 0x01fa, 0x01f8, 0x01f6, 0x01f4, 0x01f2, 0x01f0, 0x01ee, 0x01ec, 0x01ea, 0x01e9, 0x01e7, 0x01e5, 0x01e3, 0x01e1,
    0x01e0, 0x01de, 0x01dc, 0x01da, 0x01d9, 0x01d7, 0x01d5, 0x01d4, 0x01d2, 0x01d0, 0x01cf, 0x01cd, 0x01cb, 0x01ca, 0x01c8, 0x01c7,
    0x01c5, 0x01c3, 0x01c2, 0x01c0, 0x01bf, 0x01bd, 0x01bc, 0x01ba, 0x01b9, 0x01b7, 0x01b6, 0x01b4, 0x01b3, 0x01b2, 0x01b0, 0x01af,
    0x01ad, 0x01ac, 0x01aa, 0x01a9, 0x01a8, 0x01a6, 0x01a5, 0x01a4, 0x01a2, 0x01a1, 0x01a0, 0x019e, 0x019d, 0x019c, 0x019a, 0x0199,
    0x0198, 0x0197, 0x0195, 0x0194, 0x0193, 0x0192, 0x0190, 0x018f, 0x018e, 0x018d, 0x018b, 0x018a, 0x0189, 0x0188, 0x0187, 0x0186,
    0x0184, 0x0183, 0x0182, 0x0181, 0x0180, 0x017f, 0x017e, 0x017d, 0x017b, 0x017a, 0x0179, 0x0178, 0x0177, 0x0176, 0x0175, 0x0174,
    0x0173, 0x0172, 0x0171, 0x0170, 0x016f, 0x016e, 0x016d, 0x016c, 0x016b, 0x016a, 0x0169, 0x0168, 0x0167, 0x0166, 0x0165, 0x0164,
    0x0163, 0x0162, 0x0161, 0x0160, 0x015f, 0x015e, 0x015d, 0x015c, 0x015b, 0x015a, 0x0159, 0x0158, 0x0158, 0x0157, 0x0156, 0x0155,
    0x0154, 0x0153, 0x0152, 0x0151, 0x0150, 0x0150, 0x014f, 0x014e, 0x014d, 0x014c, 0x014b, 0x014a, 0x014a, 0x0149, 0x0148, 0x0147,
    0x0146, 0x0146, 0x0145, 0x0144, 0x0143, 0x0142, 0x0142, 0x0141, 0x0140, 0x013f, 0x013e, 0x013e, 0x013d, 0x013c, 0x013b, 0x013b,
    0x013a, 0x0139, 0x0138, 0x0138, 0x0137, 0x0136, 0x0135, 0x0135, 0x0134, 0x0133, 0x0132, 0x0132, 0x0131, 0x0130, 0x0130, 0x012f,
    0x012e, 0x012e, 0x012d, 0x012c, 0x012b, 0x012b, 0x012a, 0x0129, 0x0129, 0x0128, 0x0127, 0x0127, 0x0126, 0x0125, 0x0125, 0x0124,
    0x0123, 0x0123, 0x0122, 0x0121, 0x0121, 0x0120, 0x0120, 0x011f, 0x011e, 0x011e, 0x011d, 0x011c, 0x011c, 0x011b, 0x011b, 0x011a,
    0x0119, 0x0119, 0x0118, 0x0118, 0x0117, 0x0116, 0x0116, 0x0115, 0x0115, 0x0114, 0x0113, 0x0113, 0x0112, 0x0112, 0x0111, 0x0111,
    0x0110, 0x010f, 0x010f, 0x010e, 0x010e, 0x010d, 0x010d, 0x010c, 0x010c, 0x010b, 0x010a, 0x010a, 0x0109, 0x0109, 0x0108, 0x0108,
    0x0107, 0x0107, 0x0106, 0x0106, 0x0105, 0x0105, 0x0104, 0x0104, 0x0103, 0x0103, 0x0102, 0x0102, 0x0101, 0x0101, 0x0100, 0x0100,
    0x00ff, 0x00ff, 0x00fe, 0x00fe, 0x00fd, 0x00fd, 0x00fc, 0x00fc, 0x00fb, 0x00fb, 0x00fa, 0x00fa, 0x00f9, 0x00f9, 0x00f8, 0x00f8,
    0x00f7, 0x00f7, 0x00f6, 0x00f6, 0x00f5, 0x00f5, 0x00f4, 0x00f4, 0x00f4, 0x00f3, 0x00f3, 0x00f2, 0x00f2, 0x00f1, 0x00f1, 0x00f0,
    0x00f0, 0x00f0, 0x00ef, 0x00ef, 0x00ee, 0x00ee, 0x00ed, 0x00ed, 0x00ed, 0x00ec, 0x00ec, 0x00eb, 0x00eb, 0x00ea, 0x00ea, 0x00ea,
    0x00e9, 0x00e9, 0x00e8, 0x00e8, 0x00e7, 0x00e7, 0x00e7, 0x00e6, 0x00e6, 0x00e5, 0x00e5, 0x00e5, 0x00e4, 0x00e4, 0x00e3, 0x00e3,
    0x00e3, 0x00e2, 0x00e2, 0x00e1, 0x00e1, 0x00e1, 0x00e0, 0x00e0, 0x00e0, 0x00df, 0x00df, 0x00de, 0x00de, 0x00de, 0x00dd, 0x00dd,
    0x00dd, 0x00dc, 0x00dc, 0x00db, 0x00db, 0x00db, 0x00da, 0x00da, 0x00da, 0x00d9, 0x00d9, 0x00d9, 0x00d8, 0x00d8, 0x00d7, 0x00d7,
    0x00d7, 0x00d6, 0x00d6, 0x00d6, 0x00d5, 0x00d5, 0x00d5, 0x00d4, 0x00d4, 0x00d4, 0x00d3, 0x00d3, 0x00d3, 0x00d2, 0x00d2, 0x00d2,
    0x00d1, 0x00d1, 0x00d1, 0x00d0, 0x00d0, 0x00d0, 0x00cf, 0x00cf, 0x00cf, 0x00ce, 0x00ce, 0x00ce, 0x00cd, 0x00cd, 0x00cd, 0x00cc,
    0x00cc, 0x00cc, 0x00cb, 0x00cb, 0x00cb, 0x00ca, 0x00ca, 0x00ca, 0x00c9, 0x00c9, 0x00c9, 0x00c9, 0x00c8, 0x00c8, 0x00c8, 0x00c7,
    0x00c7, 0x00c7, 0x00c6, 0x00c6, 0x00c6, 0x00c5, 0x00c5, 0x00c5, 0x00c5, 0x00c4, 0x00c4, 0x00c4, 0x00c3, 0x00c3, 0x00c3, 0x00c3,
    0x00c2, 0x00c2, 0x00c2, 0x00c1, 0x00c1, 0x00c1, 0x00c1, 0x00c0, 0x00c0, 0x00c0, 0x00bf, 0x00bf, 0x00bf, 0x00bf, 0x00be, 0x00be,
    0x00be, 0x00bd, 0x00bd, 0x00bd, 0x00bd, 0x00bc, 0x00bc, 0x00bc, 0x00bc, 0x00bb, 0x00bb, 0x00bb, 0x00ba, 0x00ba, 0x00ba, 0x00ba,
    0x00b9, 0x00b9, 0x00b9, 0x00b9, 0x00b8, 0x00b8, 0x00b8, 0x00b8, 0x00b7, 0x00b7, 0x00b7, 0x00b7, 0x00b6, 0x00b6, 0x00b6, 0x00b6,
    0x00b5, 0x00b5, 0x00b5, 0x00b5, 0x00b4, 0x00b4, 0x00b4, 0x00b4, 0x00b3, 0x00b3, 0x00b3, 0x00b3, 0x00b2, 0x00b2, 0x00b2, 0x00b2,
    0x00b1, 0x00b1, 0x00b1, 0x00b1, 0x00b0, 0x00b0, 0x00b0, 0x00b0, 0x00af, 0x00af, 0x00af, 0x00af, 0x00ae, 0x00ae, 0x00ae, 0x00ae,
    0x00ae, 0x00ad, 0x00ad, 0x00ad, 0x00ad, 0x00ac, 0x00ac, 0x00ac, 0x00ac, 0x00ac, 0x00ab, 0x00ab, 0x00ab, 0x00ab,
};

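// Compress a single 4x4 block of 32-bit RGBA pixels (64 bytes) into one 64-bit
// DXT1 block: two 5:6:5 endpoint colors in the low 32 bits, sixteen 2-bit
// selectors in the high 32 bits.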
static tracy_force_inline uint64_t ProcessRGB( const uint8_t* src )
{
#ifdef __SSE4_1__
    __m128i px0 = _mm_loadu_si128(((__m128i*)src) + 0);
    __m128i px1 = _mm_loadu_si128(((__m128i*)src) + 1);
    __m128i px2 = _mm_loadu_si128(((__m128i*)src) + 2);
    __m128i px3 = _mm_loadu_si128(((__m128i*)src) + 3);

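    // Check whether all 16 pixels quantize to the same 5:6:5 color; solid blocks take a shortcut.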
    __m128i smask = _mm_set1_epi32( 0xF8FCF8 );
    __m128i sd0 = _mm_and_si128( px0, smask );
    __m128i sd1 = _mm_and_si128( px1, smask );
    __m128i sd2 = _mm_and_si128( px2, smask );
    __m128i sd3 = _mm_and_si128( px3, smask );

    __m128i sc = _mm_shuffle_epi32(sd0, _MM_SHUFFLE(0, 0, 0, 0));

    __m128i sc0 = _mm_cmpeq_epi8(sd0, sc);
    __m128i sc1 = _mm_cmpeq_epi8(sd1, sc);
    __m128i sc2 = _mm_cmpeq_epi8(sd2, sc);
    __m128i sc3 = _mm_cmpeq_epi8(sd3, sc);

    __m128i sm0 = _mm_and_si128(sc0, sc1);
    __m128i sm1 = _mm_and_si128(sc2, sc3);
    __m128i sm = _mm_and_si128(sm0, sm1);

    if( _mm_testc_si128(sm, _mm_set1_epi32(-1)) )
    {
        return uint64_t( to565( src[0], src[1], src[2] ) ) << 16;
    }

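    // Per-channel min/max reduction across all 16 pixels.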
    __m128i min0 = _mm_min_epu8( px0, px1 );
    __m128i min1 = _mm_min_epu8( px2, px3 );
    __m128i min2 = _mm_min_epu8( min0, min1 );

    __m128i max0 = _mm_max_epu8( px0, px1 );
    __m128i max1 = _mm_max_epu8( px2, px3 );
    __m128i max2 = _mm_max_epu8( max0, max1 );

    __m128i min3 = _mm_shuffle_epi32( min2, _MM_SHUFFLE( 2, 3, 0, 1 ) );
    __m128i max3 = _mm_shuffle_epi32( max2, _MM_SHUFFLE( 2, 3, 0, 1 ) );
    __m128i min4 = _mm_min_epu8( min2, min3 );
    __m128i max4 = _mm_max_epu8( max2, max3 );

    __m128i min5 = _mm_shuffle_epi32( min4, _MM_SHUFFLE( 0, 0, 2, 2 ) );
    __m128i max5 = _mm_shuffle_epi32( max4, _MM_SHUFFLE( 0, 0, 2, 2 ) );
    __m128i rmin = _mm_min_epu8( min4, min5 );
    __m128i rmax = _mm_max_epu8( max4, max5 );

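    // The summed channel range selects a fixed-point reciprocal; the endpoints
    // are inset by 1/16 of the per-channel range.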
    __m128i range1 = _mm_subs_epu8( rmax, rmin );
    __m128i range2 = _mm_sad_epu8( rmax, rmin );

    uint32_t vrange = _mm_cvtsi128_si32( range2 ) >> 1;
    __m128i range = _mm_set1_epi16( DivTable[vrange] );

    __m128i inset1 = _mm_srli_epi16( range1, 4 );
    __m128i inset = _mm_and_si128( inset1, _mm_set1_epi8( 0xF ) );
    __m128i min = _mm_adds_epu8( rmin, inset );
    __m128i max = _mm_subs_epu8( rmax, inset );

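    // Sum each pixel's per-channel distance from the block minimum, then scale
    // the sums into 2-bit selectors.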
    __m128i c0 = _mm_subs_epu8( px0, rmin );
    __m128i c1 = _mm_subs_epu8( px1, rmin );
    __m128i c2 = _mm_subs_epu8( px2, rmin );
    __m128i c3 = _mm_subs_epu8( px3, rmin );

    __m128i is0 = _mm_maddubs_epi16( c0, _mm_set1_epi8( 1 ) );
    __m128i is1 = _mm_maddubs_epi16( c1, _mm_set1_epi8( 1 ) );
    __m128i is2 = _mm_maddubs_epi16( c2, _mm_set1_epi8( 1 ) );
    __m128i is3 = _mm_maddubs_epi16( c3, _mm_set1_epi8( 1 ) );

    __m128i s0 = _mm_hadd_epi16( is0, is1 );
    __m128i s1 = _mm_hadd_epi16( is2, is3 );

    __m128i m0 = _mm_mulhi_epu16( s0, range );
    __m128i m1 = _mm_mulhi_epu16( s1, range );

    __m128i p0 = _mm_packus_epi16( m0, m1 );

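    // Each 32-bit lane holds four selector bytes; fold them into eight packed
    // index bits per lane and gather the low bytes.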
    __m128i p1 = _mm_or_si128( _mm_srai_epi32( p0, 6 ), _mm_srai_epi32( p0, 12 ) );
    __m128i p2 = _mm_or_si128( _mm_srai_epi32( p0, 18 ), p0 );
    __m128i p3 = _mm_or_si128( p1, p2 );
    __m128i p = _mm_shuffle_epi8( p3, _mm_set1_epi32( 0x0C080400 ) );

    uint32_t vmin = _mm_cvtsi128_si32( min );
    uint32_t vmax = _mm_cvtsi128_si32( max );
    uint32_t vp = _mm_cvtsi128_si32( p );

    return uint64_t( ( uint64_t( to565( vmin ) ) << 16 ) | to565( vmax ) | ( uint64_t( vp ) << 32 ) );
#elif defined __ARM_NEON
#  ifdef __aarch64__
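    // AArch64 provides horizontal min/max reductions, so the deinterleaved
    // channels can be reduced directly.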
    uint8x16x4_t px = vld4q_u8( src );

    uint8x16_t lr = px.val[0];
    uint8x16_t lg = px.val[1];
    uint8x16_t lb = px.val[2];

    uint8_t rmaxr = vmaxvq_u8( lr );
    uint8_t rmaxg = vmaxvq_u8( lg );
    uint8_t rmaxb = vmaxvq_u8( lb );

    uint8_t rminr = vminvq_u8( lr );
    uint8_t rming = vminvq_u8( lg );
    uint8_t rminb = vminvq_u8( lb );

    int rr = rmaxr - rminr;
    int rg = rmaxg - rming;
    int rb = rmaxb - rminb;

    int vrange1 = rr + rg + rb;
    uint16_t vrange2 = DivTableNEON[vrange1];

    uint8_t insetr = rr >> 4;
    uint8_t insetg = rg >> 4;
    uint8_t insetb = rb >> 4;

    uint8_t minr = rminr + insetr;
    uint8_t ming = rming + insetg;
    uint8_t minb = rminb + insetb;

    uint8_t maxr = rmaxr - insetr;
    uint8_t maxg = rmaxg - insetg;
    uint8_t maxb = rmaxb - insetb;

    uint8x16_t cr = vsubq_u8( lr, vdupq_n_u8( rminr ) );
    uint8x16_t cg = vsubq_u8( lg, vdupq_n_u8( rming ) );
    uint8x16_t cb = vsubq_u8( lb, vdupq_n_u8( rminb ) );

    uint16x8_t is0l = vaddl_u8( vget_low_u8( cr ), vget_low_u8( cg ) );
    uint16x8_t is0h = vaddl_u8( vget_high_u8( cr ), vget_high_u8( cg ) );
    uint16x8_t is1l = vaddw_u8( is0l, vget_low_u8( cb ) );
    uint16x8_t is1h = vaddw_u8( is0h, vget_high_u8( cb ) );

    int16x8_t range = vdupq_n_s16( vrange2 );
    uint16x8_t m0 = vreinterpretq_u16_s16( vqdmulhq_s16( vreinterpretq_s16_u16( is1l ), range ) );
    uint16x8_t m1 = vreinterpretq_u16_s16( vqdmulhq_s16( vreinterpretq_s16_u16( is1h ), range ) );

    uint8x8_t p00 = vmovn_u16( m0 );
    uint8x8_t p01 = vmovn_u16( m1 );
    uint8x16_t p0 = vcombine_u8( p00, p01 );

    uint32x4_t p1 = vaddq_u32( vshrq_n_u32( vreinterpretq_u32_u8( p0 ), 6 ), vshrq_n_u32( vreinterpretq_u32_u8( p0 ), 12 ) );
    uint32x4_t p2 = vaddq_u32( vshrq_n_u32( vreinterpretq_u32_u8( p0 ), 18 ), vreinterpretq_u32_u8( p0 ) );
    uint32x4_t p3 = vaddq_u32( p1, p2 );

    uint16x4x2_t p4 = vuzp_u16( vget_low_u16( vreinterpretq_u16_u32( p3 ) ), vget_high_u16( vreinterpretq_u16_u32( p3 ) ) );
    uint8x8x2_t p = vuzp_u8( vreinterpret_u8_u16( p4.val[0] ), vreinterpret_u8_u16( p4.val[0] ) );

    uint32_t vp;
    vst1_lane_u32( &vp, vreinterpret_u32_u8( p.val[0] ), 0 );

    return uint64_t( ( uint64_t( to565( minr, ming, minb ) ) << 16 ) | to565( maxr, maxg, maxb ) | ( uint64_t( vp ) << 32 ) );
#  else
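    // 32-bit NEON has no horizontal reductions; mirror the shuffle-based SSE approach.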
    uint32x4_t px0 = vld1q_u32( (uint32_t*)src );
    uint32x4_t px1 = vld1q_u32( (uint32_t*)src + 4 );
    uint32x4_t px2 = vld1q_u32( (uint32_t*)src + 8 );
    uint32x4_t px3 = vld1q_u32( (uint32_t*)src + 12 );

    uint32x4_t smask = vdupq_n_u32( 0xF8FCF8 );
    uint32x4_t sd0 = vandq_u32( smask, px0 );
    uint32x4_t sd1 = vandq_u32( smask, px1 );
    uint32x4_t sd2 = vandq_u32( smask, px2 );
    uint32x4_t sd3 = vandq_u32( smask, px3 );

    uint32x4_t sc = vdupq_n_u32( sd0[0] );

    uint32x4_t sc0 = vceqq_u32( sd0, sc );
    uint32x4_t sc1 = vceqq_u32( sd1, sc );
    uint32x4_t sc2 = vceqq_u32( sd2, sc );
    uint32x4_t sc3 = vceqq_u32( sd3, sc );

    uint32x4_t sm0 = vandq_u32( sc0, sc1 );
    uint32x4_t sm1 = vandq_u32( sc2, sc3 );
    int64x2_t sm = vreinterpretq_s64_u32( vandq_u32( sm0, sm1 ) );

    if( sm[0] == -1 && sm[1] == -1 )
    {
        return uint64_t( to565( src[0], src[1], src[2] ) ) << 16;
    }

    uint32x4_t mask = vdupq_n_u32( 0xFFFFFF );
    uint8x16_t l0 = vreinterpretq_u8_u32( vandq_u32( mask, px0 ) );
    uint8x16_t l1 = vreinterpretq_u8_u32( vandq_u32( mask, px1 ) );
    uint8x16_t l2 = vreinterpretq_u8_u32( vandq_u32( mask, px2 ) );
    uint8x16_t l3 = vreinterpretq_u8_u32( vandq_u32( mask, px3 ) );

    uint8x16_t min0 = vminq_u8( l0, l1 );
    uint8x16_t min1 = vminq_u8( l2, l3 );
    uint8x16_t min2 = vminq_u8( min0, min1 );

    uint8x16_t max0 = vmaxq_u8( l0, l1 );
    uint8x16_t max1 = vmaxq_u8( l2, l3 );
    uint8x16_t max2 = vmaxq_u8( max0, max1 );

    uint8x16_t min3 = vreinterpretq_u8_u32( vrev64q_u32( vreinterpretq_u32_u8( min2 ) ) );
    uint8x16_t max3 = vreinterpretq_u8_u32( vrev64q_u32( vreinterpretq_u32_u8( max2 ) ) );

    uint8x16_t min4 = vminq_u8( min2, min3 );
    uint8x16_t max4 = vmaxq_u8( max2, max3 );

    uint8x16_t min5 = vcombine_u8( vget_high_u8( min4 ), vget_low_u8( min4 ) );
    uint8x16_t max5 = vcombine_u8( vget_high_u8( max4 ), vget_low_u8( max4 ) );

    uint8x16_t rmin = vminq_u8( min4, min5 );
    uint8x16_t rmax = vmaxq_u8( max4, max5 );

    uint8x16_t range1 = vsubq_u8( rmax, rmin );
    uint8x8_t range2 = vget_low_u8( range1 );
    uint8x8x2_t range3 = vzip_u8( range2, vdup_n_u8( 0 ) );
    uint16x4_t range4 = vreinterpret_u16_u8( range3.val[0] );

    uint16_t vrange1;
    uint16x4_t range5 = vpadd_u16( range4, range4 );
    uint16x4_t range6 = vpadd_u16( range5, range5 );
    vst1_lane_u16( &vrange1, range6, 0 );

    uint32_t vrange2 = ( 2 << 16 ) / uint32_t( vrange1 + 1 );
    uint16x8_t range = vdupq_n_u16( vrange2 );

    uint8x16_t inset = vshrq_n_u8( range1, 4 );
    uint8x16_t min = vaddq_u8( rmin, inset );
    uint8x16_t max = vsubq_u8( rmax, inset );

    uint8x16_t c0 = vsubq_u8( l0, rmin );
    uint8x16_t c1 = vsubq_u8( l1, rmin );
    uint8x16_t c2 = vsubq_u8( l2, rmin );
    uint8x16_t c3 = vsubq_u8( l3, rmin );

    uint16x8_t is0 = vpaddlq_u8( c0 );
    uint16x8_t is1 = vpaddlq_u8( c1 );
    uint16x8_t is2 = vpaddlq_u8( c2 );
    uint16x8_t is3 = vpaddlq_u8( c3 );

    uint16x4_t is4 = vpadd_u16( vget_low_u16( is0 ), vget_high_u16( is0 ) );
    uint16x4_t is5 = vpadd_u16( vget_low_u16( is1 ), vget_high_u16( is1 ) );
    uint16x4_t is6 = vpadd_u16( vget_low_u16( is2 ), vget_high_u16( is2 ) );
    uint16x4_t is7 = vpadd_u16( vget_low_u16( is3 ), vget_high_u16( is3 ) );

    uint16x8_t s0 = vcombine_u16( is4, is5 );
    uint16x8_t s1 = vcombine_u16( is6, is7 );

    uint16x8_t m0 = vreinterpretq_u16_s16( vqdmulhq_s16( vreinterpretq_s16_u16( s0 ), vreinterpretq_s16_u16( range ) ) );
    uint16x8_t m1 = vreinterpretq_u16_s16( vqdmulhq_s16( vreinterpretq_s16_u16( s1 ), vreinterpretq_s16_u16( range ) ) );

    uint8x8_t p00 = vmovn_u16( m0 );
    uint8x8_t p01 = vmovn_u16( m1 );
    uint8x16_t p0 = vcombine_u8( p00, p01 );

    uint32x4_t p1 = vaddq_u32( vshrq_n_u32( vreinterpretq_u32_u8( p0 ), 6 ), vshrq_n_u32( vreinterpretq_u32_u8( p0 ), 12 ) );
    uint32x4_t p2 = vaddq_u32( vshrq_n_u32( vreinterpretq_u32_u8( p0 ), 18 ), vreinterpretq_u32_u8( p0 ) );
    uint32x4_t p3 = vaddq_u32( p1, p2 );

    uint16x4x2_t p4 = vuzp_u16( vget_low_u16( vreinterpretq_u16_u32( p3 ) ), vget_high_u16( vreinterpretq_u16_u32( p3 ) ) );
    uint8x8x2_t p = vuzp_u8( vreinterpret_u8_u16( p4.val[0] ), vreinterpret_u8_u16( p4.val[0] ) );

    uint32_t vmin, vmax, vp;
    vst1q_lane_u32( &vmin, vreinterpretq_u32_u8( min ), 0 );
    vst1q_lane_u32( &vmax, vreinterpretq_u32_u8( max ), 0 );
    vst1_lane_u32( &vp, vreinterpret_u32_u8( p.val[0] ), 0 );

    return uint64_t( ( uint64_t( to565( vmin ) ) << 16 ) | to565( vmax ) | ( uint64_t( vp ) << 32 ) );
#  endif
#else
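    // Portable scalar fallback implementing the same algorithm.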
    const auto ref = to565( src[0], src[1], src[2] );
    auto stmp = src + 4;
    for( int i=1; i<16; i++ )
    {
        if( to565( stmp[0], stmp[1], stmp[2] ) != ref )
        {
            break;
        }
        stmp += 4;
    }
    if( stmp == src + 64 )
    {
        return uint64_t( ref ) << 16;
    }

    uint8_t min[3] = { src[0], src[1], src[2] };
    uint8_t max[3] = { src[0], src[1], src[2] };
    auto tmp = src + 4;
    for( int i=1; i<16; i++ )
    {
        for( int j=0; j<3; j++ )
        {
            if( tmp[j] < min[j] ) min[j] = tmp[j];
            else if( tmp[j] > max[j] ) max[j] = tmp[j];
        }
        tmp += 4;
    }

    const uint32_t range = DivTable[max[0] - min[0] + max[1] - min[1] + max[2] - min[2]];
    const uint32_t rmin = min[0] + min[1] + min[2];
    for( int i=0; i<3; i++ )
    {
        const uint8_t inset = ( max[i] - min[i] ) >> 4;
        min[i] += inset;
        max[i] -= inset;
    }

    uint32_t data = 0;
    for( int i=0; i<16; i++ )
    {
        const uint32_t c = src[0] + src[1] + src[2] - rmin;
        const uint8_t idx = ( c * range ) >> 16;
        data |= idx << (i*2);
        src += 4;
    }

    return uint64_t( ( uint64_t( to565( min[0], min[1], min[2] ) ) << 16 ) | to565( max[0], max[1], max[2] ) | ( uint64_t( data ) << 32 ) );
#endif
}

#ifdef __AVX2__
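// Compress two horizontally adjacent 4x4 blocks at once in 256-bit registers,
// appending two 8-byte DXT1 blocks to dst.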
static tracy_force_inline void ProcessRGB_AVX( const uint8_t* src, char*& dst )
{
    __m256i px0 = _mm256_loadu_si256(((__m256i*)src) + 0);
    __m256i px1 = _mm256_loadu_si256(((__m256i*)src) + 1);
    __m256i px2 = _mm256_loadu_si256(((__m256i*)src) + 2);
    __m256i px3 = _mm256_loadu_si256(((__m256i*)src) + 3);

    __m256i min0 = _mm256_min_epu8( px0, px1 );
    __m256i min1 = _mm256_min_epu8( px2, px3 );
    __m256i min2 = _mm256_min_epu8( min0, min1 );

    __m256i max0 = _mm256_max_epu8( px0, px1 );
    __m256i max1 = _mm256_max_epu8( px2, px3 );
    __m256i max2 = _mm256_max_epu8( max0, max1 );

    __m256i min3 = _mm256_shuffle_epi32( min2, _MM_SHUFFLE( 2, 3, 0, 1 ) );
    __m256i max3 = _mm256_shuffle_epi32( max2, _MM_SHUFFLE( 2, 3, 0, 1 ) );
    __m256i min4 = _mm256_min_epu8( min2, min3 );
    __m256i max4 = _mm256_max_epu8( max2, max3 );

    __m256i min5 = _mm256_shuffle_epi32( min4, _MM_SHUFFLE( 0, 0, 2, 2 ) );
    __m256i max5 = _mm256_shuffle_epi32( max4, _MM_SHUFFLE( 0, 0, 2, 2 ) );
    __m256i rmin = _mm256_min_epu8( min4, min5 );
    __m256i rmax = _mm256_max_epu8( max4, max5 );

    __m256i range1 = _mm256_subs_epu8( rmax, rmin );
    __m256i range2 = _mm256_sad_epu8( rmax, rmin );

    uint16_t vrange0 = DivTableAVX[_mm256_cvtsi256_si32( range2 ) >> 1];
    uint16_t vrange1 = DivTableAVX[_mm256_extract_epi16( range2, 8 ) >> 1];
    __m256i range00 = _mm256_set1_epi16( vrange0 );
    __m256i range = _mm256_inserti128_si256( range00, _mm_set1_epi16( vrange1 ), 1 );

    __m256i inset1 = _mm256_srli_epi16( range1, 4 );
    __m256i inset = _mm256_and_si256( inset1, _mm256_set1_epi8( 0xF ) );
    __m256i min = _mm256_adds_epu8( rmin, inset );
    __m256i max = _mm256_subs_epu8( rmax, inset );

    __m256i c0 = _mm256_subs_epu8( px0, rmin );
    __m256i c1 = _mm256_subs_epu8( px1, rmin );
    __m256i c2 = _mm256_subs_epu8( px2, rmin );
    __m256i c3 = _mm256_subs_epu8( px3, rmin );

    __m256i is0 = _mm256_maddubs_epi16( c0, _mm256_set1_epi8( 1 ) );
    __m256i is1 = _mm256_maddubs_epi16( c1, _mm256_set1_epi8( 1 ) );
    __m256i is2 = _mm256_maddubs_epi16( c2, _mm256_set1_epi8( 1 ) );
    __m256i is3 = _mm256_maddubs_epi16( c3, _mm256_set1_epi8( 1 ) );

    __m256i s0 = _mm256_hadd_epi16( is0, is1 );
    __m256i s1 = _mm256_hadd_epi16( is2, is3 );

    __m256i m0 = _mm256_mulhi_epu16( s0, range );
    __m256i m1 = _mm256_mulhi_epu16( s1, range );

    __m256i p0 = _mm256_packus_epi16( m0, m1 );

    __m256i p1 = _mm256_or_si256( _mm256_srai_epi32( p0, 6 ), _mm256_srai_epi32( p0, 12 ) );
    __m256i p2 = _mm256_or_si256( _mm256_srai_epi32( p0, 18 ), p0 );
    __m256i p3 = _mm256_or_si256( p1, p2 );
    __m256i p = _mm256_shuffle_epi8( p3, _mm256_set1_epi32( 0x0C080400 ) );

    __m256i mm0 = _mm256_unpacklo_epi8( _mm256_setzero_si256(), min );
    __m256i mm1 = _mm256_unpacklo_epi8( _mm256_setzero_si256(), max );
    __m256i mm2 = _mm256_unpacklo_epi64( mm1, mm0 );
    __m256i mmr = _mm256_slli_epi64( _mm256_srli_epi64( mm2, 11 ), 11 );
    __m256i mmg = _mm256_slli_epi64( _mm256_srli_epi64( mm2, 26 ), 5 );
    __m256i mmb = _mm256_srli_epi64( _mm256_slli_epi64( mm2, 16 ), 59 );
    __m256i mm3 = _mm256_or_si256( mmr, mmg );
    __m256i mm4 = _mm256_or_si256( mm3, mmb );
    __m256i mm5 = _mm256_shuffle_epi8( mm4, _mm256_set1_epi32( 0x09080100 ) );

    __m256i d0 = _mm256_unpacklo_epi32( mm5, p );
    __m256i d1 = _mm256_permute4x64_epi64( d0, _MM_SHUFFLE( 3, 2, 2, 0 ) );
    _mm_storeu_si128( (__m128i*)dst, _mm256_castsi256_si128( d1 ) );
    dst += 16;
}
#endif

void CompressImageDxt1( const char* src, char* dst, int w, int h )
{
    assert( (w % 4) == 0 && (h % 4) == 0 );

#ifdef __AVX2__
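    // With AVX2 and a width divisible by 8, gather an 8x4-pixel strip (two blocks) per iteration.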
    if( w%8 == 0 )
    {
        uint32_t buf[8*4];
        int i = 0;

        auto blocks = w * h / 32;
        do
        {
            auto tmp = (char*)buf;
            memcpy( tmp,        src,          8*4 );
            memcpy( tmp + 8*4,  src + w * 4,  8*4 );
            memcpy( tmp + 16*4, src + w * 8,  8*4 );
            memcpy( tmp + 24*4, src + w * 12, 8*4 );
            src += 8*4;
            if( ++i == w/8 )
            {
                src += w * 3 * 4;
                i = 0;
            }

            ProcessRGB_AVX( (uint8_t*)buf, dst );
        }
        while( --blocks );
    }
    else
#endif
    {
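        // Gather one 4x4 block into a contiguous buffer, compress it, and append the 8-byte result.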
        uint32_t buf[4*4];
        int i = 0;

        auto ptr = dst;
        auto blocks = w * h / 16;
        do
        {
            auto tmp = (char*)buf;
            memcpy( tmp,        src,          4*4 );
            memcpy( tmp + 4*4,  src + w * 4,  4*4 );
            memcpy( tmp + 8*4,  src + w * 8,  4*4 );
            memcpy( tmp + 12*4, src + w * 12, 4*4 );
            src += 4*4;
            if( ++i == w/4 )
            {
                src += w * 3 * 4;
                i = 0;
            }

            const auto c = ProcessRGB( (uint8_t*)buf );
            memcpy( ptr, &c, sizeof( uint64_t ) );
            ptr += sizeof( uint64_t );
        }
        while( --blocks );
    }
}

}