|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#ifndef UNICODE_ENCODING_H |
|
#define UNICODE_ENCODING_H |
|
|
|
#include <stdint.h> |
|
|
|
|
|
|
|
// Encodes the code point `code` as UTF-8 into `out`.
//
// Writes between one and four bytes starting at out[0] and returns the number
// of bytes written.  No terminator is appended, so `out` needs room for up to
// four bytes.  Returns 0, leaving `out` untouched, when `code` is a surrogate
// (0xD800..0xDFFF) or is 0x110000 or greater.
static inline int encodeUtf8(char *out, uint32_t code) {
    // Reject everything that has no UTF-8 encoding before touching `out`.
    if ((code >= 0xD800 && code <= 0xDFFF) || code >= 0x110000) {
        return 0;
    }

    // One byte: 0xxxxxxx
    if (code < 0x80) {
        out[0] = (char)code;
        return 1;
    }

    // Two bytes: 110xxxxx 10xxxxxx
    if (code < 0x800) {
        out[0] = (char)(0xC0 | ((code >> 6) & 0x1F));
        out[1] = (char)(0x80 | (code & 0x3F));
        return 2;
    }

    // Three bytes: 1110xxxx 10xxxxxx 10xxxxxx
    if (code < 0x10000) {
        out[0] = (char)(0xE0 | ((code >> 12) & 0x0F));
        out[1] = (char)(0x80 | ((code >> 6) & 0x3F));
        out[2] = (char)(0x80 | (code & 0x3F));
        return 3;
    }

    // Four bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
    out[0] = (char)(0xF0 | ((code >> 18) & 0x07));
    out[1] = (char)(0x80 | ((code >> 12) & 0x3F));
    out[2] = (char)(0x80 | ((code >> 6) & 0x3F));
    out[3] = (char)(0x80 | (code & 0x3F));
    return 4;
}
|
|
|
|
|
|
|
// Encodes the code point `code` as UTF-16 into `out`.
//
// Writes one unit for code points below 0x10000, or a high/low surrogate pair
// for code points up to 0x10FFFF, and returns the number of units written.
// No terminator is appended, so `out` needs room for up to two units.
// Returns 0, leaving `out` untouched, when `code` is itself a surrogate
// (0xD800..0xDFFF) or is 0x110000 or greater.
static inline int encodeUtf16(wchar_t *out, uint32_t code) {
    // Reject everything that has no UTF-16 encoding before touching `out`.
    if ((code >= 0xD800 && code <= 0xDFFF) || code > 0x10FFFF) {
        return 0;
    }

    // Code points below 0x10000 are stored as a single unit.
    if (code <= 0xFFFF) {
        out[0] = (wchar_t)code;
        return 1;
    }

    // Remaining code points: split the 20-bit offset from 0x10000 into two
    // 10-bit halves carried by the high and low surrogate.
    const uint32_t offset = code - 0x10000;
    out[0] = (wchar_t)(0xD800 | (offset >> 10));
    out[1] = (wchar_t)(0xDC00 | (offset & 0x3FF));
    return 2;
}
|
|
|
|
|
|
|
// Returns the length in bytes (1..4) of the UTF-8 sequence announced by the
// lead byte `firstByte`, judged from its high-bit pattern alone.  Returns 0
// when the byte is not a lead byte: either 10xxxxxx or 11111xxx.
static inline int utf8CharLength(char firstByte) {
    // Compare on the unsigned value so the result does not depend on whether
    // plain `char` is signed.
    const unsigned char lead = (unsigned char)firstByte;

    if (lead < 0x80) {
        return 1;   // 0xxxxxxx
    }
    if (lead < 0xC0) {
        return 0;   // 10xxxxxx: not a lead byte
    }
    if (lead < 0xE0) {
        return 2;   // 110xxxxx
    }
    if (lead < 0xF0) {
        return 3;   // 1110xxxx
    }
    if (lead < 0xF8) {
        return 4;   // 11110xxx
    }
    return 0;       // 11111xxx: not a lead byte
}
|
|
|
|
|
|
|
static inline uint32_t decodeUtf8(const char *in) { |
|
const uint32_t kInvalid = static_cast<uint32_t>(-1); |
|
switch (utf8CharLength(in[0])) { |
|
case 1: { |
|
return in[0]; |
|
} |
|
case 2: { |
|
if ((in[1] & 0xC0) != 0x80) { |
|
return kInvalid; |
|
} |
|
uint32_t tmp = 0; |
|
tmp = (in[0] & 0x1F) << 6; |
|
tmp |= (in[1] & 0x3F); |
|
return tmp <= 0x7F ? kInvalid : tmp; |
|
} |
|
case 3: { |
|
if ((in[1] & 0xC0) != 0x80 || |
|
(in[2] & 0xC0) != 0x80) { |
|
return kInvalid; |
|
} |
|
uint32_t tmp = 0; |
|
tmp = (in[0] & 0x0F) << 12; |
|
tmp |= (in[1] & 0x3F) << 6; |
|
tmp |= (in[2] & 0x3F); |
|
if (tmp <= 0x07FF || (tmp >= 0xD800 && tmp <= 0xDFFF)) { |
|
return kInvalid; |
|
} else { |
|
return tmp; |
|
} |
|
} |
|
case 4: { |
|
if ((in[1] & 0xC0) != 0x80 || |
|
(in[2] & 0xC0) != 0x80 || |
|
(in[3] & 0xC0) != 0x80) { |
|
return kInvalid; |
|
} |
|
uint32_t tmp = 0; |
|
tmp = (in[0] & 0x07) << 18; |
|
tmp |= (in[1] & 0x3F) << 12; |
|
tmp |= (in[2] & 0x3F) << 6; |
|
tmp |= (in[3] & 0x3F); |
|
if (tmp <= 0xFFFF || tmp > 0x10FFFF) { |
|
return kInvalid; |
|
} else { |
|
return tmp; |
|
} |
|
} |
|
default: { |
|
return kInvalid; |
|
} |
|
} |
|
} |
|
|
|
// Combines a UTF-16 surrogate pair into the code point it encodes.
//
// `ch1` is taken as the high surrogate (offset from 0xD800) and `ch2` as the
// low surrogate (offset from 0xDC00).  The arguments are not validated; the
// arithmetic is applied to whatever values are passed in.
static inline uint32_t decodeSurrogatePair(wchar_t ch1, wchar_t ch2) {
    // 0x10000 + (high payload << 10) + low payload
    return 0x10000 + ((ch1 - 0xD800) << 10) + (ch2 - 0xDC00);
}
|
|
|
#endif |
|
|