utf8.h (1682B)
1 #pragma once 2 3 #include "intrinsics.h" 4 5 // Copyright (c) 2008-2010 Bjoern Hoehrmann <bjoern@hoehrmann.de> 6 // See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details. 7 8 #define UTF8_ACCEPT 0 9 #define UTF8_REJECT 12 10 11 uint32_t decode(uint32_t* state, uint32_t* codep, uint32_t byte) { 12 static const uint8_t utf8d[] = { 13 // The first part of the table maps bytes to character classes that 14 // to reduce the size of the transition table and create bitmasks. 15 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 16 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 17 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 18 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 19 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, 20 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, 21 8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 22 10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8, 23 24 // The second part is a transition table that maps a combination 25 // of a state of the automaton and a character class to a state. 26 0,12,24,36,60,96,84,12,12,12,48,72, 12,12,12,12,12,12,12,12,12,12,12,12, 27 12, 0,12,12,12,12,12, 0,12, 0,12,12, 12,24,12,12,12,12,12,24,12,24,12,12, 28 12,12,12,12,12,12,12,24,12,12,12,12, 12,24,12,12,12,12,12,12,12,24,12,12, 29 12,12,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,36,12,36,12,12, 30 12,36,12,12,12,12,12,12,12,12,12,12, 31 }; 32 33 uint32_t type = utf8d[byte]; 34 35 *codep = (*state != UTF8_ACCEPT) ? 36 (byte & 0x3fu) | (*codep << 6) : 37 (0xff >> type) & (byte); 38 39 *state = utf8d[256 + *state + type]; 40 return *state; 41 }