lutf8lib.cc (7075B)
1 /* 2 ** $Id: lutf8lib.c,v 1.16.1.1 2017/04/19 17:29:57 roberto Exp $ 3 ** Standard library for UTF-8 manipulation 4 ** See Copyright Notice in lua.h 5 */ 6 7 #define lutf8lib_c 8 #define LUA_LIB 9 10 #include "lprefix.h" 11 12 13 #include <assert.h> 14 #include <limits.h> 15 #include <stdlib.h> 16 #include <string.h> 17 18 #include "lua.h" 19 20 #include "lauxlib.h" 21 #include "lualib.h" 22 23 #define MAXUNICODE 0x10FFFF 24 25 #define iscont(p) ((*(p) & 0xC0) == 0x80) 26 27 28 /* from strlib */ 29 /* translate a relative string position: negative means back from end */ 30 static lua_Integer u_posrelat (lua_Integer pos, size_t len) { 31 if (pos >= 0) return pos; 32 else if (0u - (size_t)pos > len) return 0; 33 else return (lua_Integer)len + pos + 1; 34 } 35 36 37 /* 38 ** Decode one UTF-8 sequence, returning NULL if byte sequence is invalid. 39 */ 40 static const char *utf8_decode (const char *o, int *val) { 41 static const unsigned int limits[] = {0xFF, 0x7F, 0x7FF, 0xFFFF}; 42 const unsigned char *s = (const unsigned char *)o; 43 unsigned int c = s[0]; 44 unsigned int res = 0; /* final result */ 45 if (c < 0x80) /* ascii? */ 46 res = c; 47 else { 48 int count = 0; /* to count number of continuation bytes */ 49 while (c & 0x40) { /* still have continuation bytes? */ 50 int cc = s[++count]; /* read next byte */ 51 if ((cc & 0xC0) != 0x80) /* not a continuation byte? */ 52 return NULL; /* invalid byte sequence */ 53 res = (res << 6) | (cc & 0x3F); /* add lower 6 bits from cont. byte */ 54 c <<= 1; /* to test next bit */ 55 } 56 res |= ((c & 0x7F) << (count * 5)); /* add first byte */ 57 if (count > 3 || res > MAXUNICODE || res <= limits[count]) 58 return NULL; /* invalid byte sequence */ 59 s += count; /* skip continuation bytes read */ 60 } 61 if (val) *val = res; 62 return (const char *)s + 1; /* +1 to include first byte */ 63 } 64 65 66 /* 67 ** utf8len(s [, i [, j]]) --> number of characters that start in the 68 ** range [i,j], or nil + current position if 's' is not well formed in 69 ** that interval 70 */ 71 static int utflen (lua_State *L) { 72 int n = 0; 73 size_t len; 74 const char *s = luaL_checklstring(L, 1, &len); 75 lua_Integer posi = u_posrelat(luaL_optinteger(L, 2, 1), len); 76 lua_Integer posj = u_posrelat(luaL_optinteger(L, 3, -1), len); 77 luaL_argcheck(L, 1 <= posi && --posi <= (lua_Integer)len, 2, 78 "initial position out of string"); 79 luaL_argcheck(L, --posj < (lua_Integer)len, 3, 80 "final position out of string"); 81 while (posi <= posj) { 82 const char *s1 = utf8_decode(s + posi, NULL); 83 if (s1 == NULL) { /* conversion error? */ 84 lua_pushnil(L); /* return nil ... */ 85 lua_pushinteger(L, posi + 1); /* ... and current position */ 86 return 2; 87 } 88 posi = s1 - s; 89 n++; 90 } 91 lua_pushinteger(L, n); 92 return 1; 93 } 94 95 96 /* 97 ** codepoint(s, [i, [j]]) -> returns codepoints for all characters 98 ** that start in the range [i,j] 99 */ 100 static int codepoint (lua_State *L) { 101 size_t len; 102 const char *s = luaL_checklstring(L, 1, &len); 103 lua_Integer posi = u_posrelat(luaL_optinteger(L, 2, 1), len); 104 lua_Integer pose = u_posrelat(luaL_optinteger(L, 3, posi), len); 105 int n; 106 const char *se; 107 luaL_argcheck(L, posi >= 1, 2, "out of range"); 108 luaL_argcheck(L, pose <= (lua_Integer)len, 3, "out of range"); 109 if (posi > pose) return 0; /* empty interval; return no values */ 110 if (pose - posi >= INT_MAX) /* (lua_Integer -> int) overflow? */ 111 return luaL_error(L, "string slice too long"); 112 n = (int)(pose - posi) + 1; 113 luaL_checkstack(L, n, "string slice too long"); 114 n = 0; 115 se = s + pose; 116 for (s += posi - 1; s < se;) { 117 int code; 118 s = utf8_decode(s, &code); 119 if (s == NULL) 120 return luaL_error(L, "invalid UTF-8 code"); 121 lua_pushinteger(L, code); 122 n++; 123 } 124 return n; 125 } 126 127 128 static void pushutfchar (lua_State *L, int arg) { 129 lua_Integer code = luaL_checkinteger(L, arg); 130 luaL_argcheck(L, 0 <= code && code <= MAXUNICODE, arg, "value out of range"); 131 lua_pushfstring(L, "%U", (long)code); 132 } 133 134 135 /* 136 ** utfchar(n1, n2, ...) -> char(n1)..char(n2)... 137 */ 138 static int utfchar (lua_State *L) { 139 int n = lua_gettop(L); /* number of arguments */ 140 if (n == 1) /* optimize common case of single char */ 141 pushutfchar(L, 1); 142 else { 143 int i; 144 luaL_Buffer b; 145 luaL_buffinit(L, &b); 146 for (i = 1; i <= n; i++) { 147 pushutfchar(L, i); 148 luaL_addvalue(&b); 149 } 150 luaL_pushresult(&b); 151 } 152 return 1; 153 } 154 155 156 /* 157 ** offset(s, n, [i]) -> index where n-th character counting from 158 ** position 'i' starts; 0 means character at 'i'. 159 */ 160 static int byteoffset (lua_State *L) { 161 size_t len; 162 const char *s = luaL_checklstring(L, 1, &len); 163 lua_Integer n = luaL_checkinteger(L, 2); 164 lua_Integer posi = (n >= 0) ? 1 : len + 1; 165 posi = u_posrelat(luaL_optinteger(L, 3, posi), len); 166 luaL_argcheck(L, 1 <= posi && --posi <= (lua_Integer)len, 3, 167 "position out of range"); 168 if (n == 0) { 169 /* find beginning of current byte sequence */ 170 while (posi > 0 && iscont(s + posi)) posi--; 171 } 172 else { 173 if (iscont(s + posi)) 174 return luaL_error(L, "initial position is a continuation byte"); 175 if (n < 0) { 176 while (n < 0 && posi > 0) { /* move back */ 177 do { /* find beginning of previous character */ 178 posi--; 179 } while (posi > 0 && iscont(s + posi)); 180 n++; 181 } 182 } 183 else { 184 n--; /* do not move for 1st character */ 185 while (n > 0 && posi < (lua_Integer)len) { 186 do { /* find beginning of next character */ 187 posi++; 188 } while (iscont(s + posi)); /* (cannot pass final '\0') */ 189 n--; 190 } 191 } 192 } 193 if (n == 0) /* did it find given character? */ 194 lua_pushinteger(L, posi + 1); 195 else /* no such character */ 196 lua_pushnil(L); 197 return 1; 198 } 199 200 201 static int iter_aux (lua_State *L) { 202 size_t len; 203 const char *s = luaL_checklstring(L, 1, &len); 204 lua_Integer n = lua_tointeger(L, 2) - 1; 205 if (n < 0) /* first iteration? */ 206 n = 0; /* start from here */ 207 else if (n < (lua_Integer)len) { 208 n++; /* skip current byte */ 209 while (iscont(s + n)) n++; /* and its continuations */ 210 } 211 if (n >= (lua_Integer)len) 212 return 0; /* no more codepoints */ 213 else { 214 int code; 215 const char *next = utf8_decode(s + n, &code); 216 if (next == NULL || iscont(next)) 217 return luaL_error(L, "invalid UTF-8 code"); 218 lua_pushinteger(L, n + 1); 219 lua_pushinteger(L, code); 220 return 2; 221 } 222 } 223 224 225 static int iter_codes (lua_State *L) { 226 luaL_checkstring(L, 1); 227 lua_pushcfunction(L, iter_aux); 228 lua_pushvalue(L, 1); 229 lua_pushinteger(L, 0); 230 return 3; 231 } 232 233 234 /* pattern to match a single UTF-8 character */ 235 #define UTF8PATT "[\0-\x7F\xC2-\xF4][\x80-\xBF]*" 236 237 238 static const luaL_Reg funcs[] = { 239 {"offset", byteoffset}, 240 {"codepoint", codepoint}, 241 {"char", utfchar}, 242 {"len", utflen}, 243 {"codes", iter_codes}, 244 /* placeholders */ 245 {"charpattern", NULL}, 246 {NULL, NULL} 247 }; 248 249 250 LUAMOD_API int luaopen_utf8 (lua_State *L) { 251 luaL_newlib(L, funcs); 252 lua_pushlstring(L, UTF8PATT, sizeof(UTF8PATT)/sizeof(char) - 1); 253 lua_setfield(L, -2, "charpattern"); 254 return 1; 255 } 256