////////////////////////////////////////////////////////////////////////////////
//
// ConvertUTF
//
// Description - Convert between UTF-8 & UTF-16, and convert \uhhhh to UTF-16
// Author - Li Ming Jie (Martin)
// Email -
// Created - 2008.03
// Licence - Free
// Copyright - Free
// Version -
// Changes -
// 2008.07.17 Add function convert_useq_to_UTF16 for converting \uhhhh
// sequence to UTF-16
//
////////////////////////////////////////////////////////////////////////////////
#include <stdio.h>
#include <string.h>
#include <wchar.h>
// UTF-8 lead-byte marker bits, indexed by the number of continuation bytes
// (0..3) that follow the lead byte: 0xxxxxxx, 110xxxxx, 1110xxxx, 11110xxx.
unsigned long FirstByteMasks[4] = {0x00UL, 0xC0UL, 0xE0UL, 0xF0UL};
// Convert NUL-terminated UTF-8 `src` into NUL-terminated UTF-16 in `dest`.
// Returns the number of wchar_t units written (terminator excluded), or 0
// when the input is rejected. `dest` is caller-provided and is not
// bounds-checked.
size_t convert_UTF8_to_UTF16(const char *src, wchar_t *dest);
// Convert NUL-terminated UTF-16 `src` into NUL-terminated UTF-8 in `dest`.
// Returns the number of bytes written (terminator excluded), or 0 when the
// input is rejected. `dest` is caller-provided and is not bounds-checked.
size_t convert_UTF16_to_UTF8(const wchar_t *src, char *dest);
// Convert a run of "\uhhhh" escapes at the start of `src` into UTF-16 code
// units in `dest` (NUL-terminated). Returns the number of units written.
size_t convert_useq_to_UTF16(const char *src, wchar_t *dest);
// Convert a NUL-terminated UTF-8 string to a NUL-terminated UTF-16 string.
//
// src  - UTF-8 input, NUL-terminated.
// dest - caller-provided output buffer; it is NOT bounds-checked. One
//        wchar_t per input byte plus one for the terminator is always
//        sufficient.
//
// Returns the number of wchar_t units written (terminator excluded), or 0 if
// the input is malformed: invalid lead byte, missing or invalid continuation
// byte, overlong encoding, an encoded surrogate (U+D800..U+DFFF), or a value
// above U+10FFFF. On failure dest may hold partial, unterminated output.
// Note that an empty input also returns 0.
size_t convert_UTF8_to_UTF16(const char *src, wchar_t *dest)
{
    size_t len = 0;

    while (*src)
    {
        // Go through unsigned char so bytes >= 0x80 never sign-extend.
        unsigned long c = (unsigned long)(unsigned char)*src++;
        unsigned long min_value; // smallest code point legal for this length
        size_t extra_bytes;

        if ((c & 0x80UL) == 0UL) // 1 octet: plain ASCII
        {
            *dest++ = (wchar_t)c;
            len++;
            continue;
        }
        else if ((c & 0xE0UL) == 0xC0UL) // 2 octets
        {
            c &= 0x1FUL;
            extra_bytes = 1;
            min_value = 0x80UL;
        }
        else if ((c & 0xF0UL) == 0xE0UL) // 3 octets
        {
            c &= 0x0FUL;
            extra_bytes = 2;
            min_value = 0x800UL;
        }
        else if ((c & 0xF8UL) == 0xF0UL) // 4 octets
        {
            c &= 0x07UL;
            extra_bytes = 3;
            min_value = 0x10000UL;
        }
        else
        {
            // Stray continuation byte, or a 5/6-octet form that UTF-16
            // cannot represent.
            return 0;
        }

        for (; extra_bytes > 0; extra_bytes--)
        {
            unsigned long trail = (unsigned long)(unsigned char)*src;

            // Must look like 10xxxxxx. This also rejects the terminating
            // NUL, i.e. a sequence truncated by end of string.
            if ((trail & 0xC0UL) != 0x80UL)
            {
                return 0;
            }
            c = (c << 6) | (trail & 0x3FUL);
            src++;
        }

        // Reject overlong forms (e.g. C0 AF for '/'): every code point must
        // use its shortest encoding.
        if (c < min_value)
        {
            return 0;
        }

        if (c < 0x10000UL)
        {
            // U+D800..U+DFFF are reserved for UTF-16 surrogates and must
            // not appear as encoded code points.
            if (c >= 0xD800UL && c <= 0xDFFFUL)
            {
                return 0;
            }
            *dest++ = (wchar_t)c;
            len++;
        }
        else
        {
            c -= 0x10000UL;
            // Anything above U+10FFFF has no UTF-16 representation.
            if (c >= 0x100000UL)
            {
                return 0;
            }
            *dest++ = (wchar_t)(0xD800UL + (c >> 10));     // high surrogate
            *dest++ = (wchar_t)(0xDC00UL + (c & 0x3FFUL)); // low surrogate
            len += 2;
        }
    }

    *dest = (wchar_t)'\0';
    return len;
}
// Convert a NUL-terminated UTF-16 string to a NUL-terminated UTF-8 string.
//
// src  - UTF-16 input, one code unit per wchar_t element (only the low 16
//        bits of each element are used), NUL-terminated.
// dest - caller-provided output buffer; it is NOT bounds-checked. Three bytes
//        per input unit plus one for the terminator is always sufficient.
//
// Returns the number of bytes written (terminator excluded), or 0 if the
// input contains an unpaired surrogate. On failure dest may hold partial,
// unterminated output. Note that an empty input also returns 0.
size_t convert_UTF16_to_UTF8(const wchar_t *src, char *dest)
{
    // Lead-byte marker bits, indexed by the number of continuation bytes.
    // Kept local and const so the encoder does not depend on mutable
    // file-scope state.
    static const unsigned long lead_marks[4] = {0x00UL, 0xC0UL, 0xE0UL, 0xF0UL};
    size_t len = 0;

    while (*src)
    {
        unsigned long c = (unsigned long)*src++ & 0xFFFFUL;
        size_t extra_bytes;
        size_t i;

        // Step 1: UTF-16 code unit(s) -> Unicode scalar value.
        if (c >= 0xD800UL && c <= 0xDBFFUL)
        {
            // High surrogate: the next unit must be a low surrogate. If the
            // string ends here, the terminator is read and rejected below.
            unsigned long low = (unsigned long)*src & 0xFFFFUL;

            if (low < 0xDC00UL || low > 0xDFFFUL)
            {
                return 0;
            }
            src++;
            // Surrogate pairs encode (code point - 0x10000) in 20 bits, so
            // the offset has to be added back after recombining the halves.
            c = 0x10000UL + ((c & 0x03FFUL) << 10) + (low & 0x03FFUL);
        }
        else if (c >= 0xDC00UL && c <= 0xDFFFUL)
        {
            // Low surrogate without a preceding high surrogate.
            return 0;
        }

        // Step 2: scalar value -> UTF-8.
        if (c < 0x80UL)
        {
            extra_bytes = 0;
        }
        else if (c < 0x800UL)
        {
            extra_bytes = 1;
        }
        else if (c < 0x10000UL)
        {
            extra_bytes = 2;
        }
        else
        {
            extra_bytes = 3;
        }

        // Emit continuation bytes back to front, consuming six bits each;
        // whatever remains in c belongs in the lead byte.
        for (i = extra_bytes; i > 0; i--)
        {
            dest[i] = (char)(0x80UL | (c & 0x3FUL));
            c >>= 6;
        }
        dest[0] = (char)(c | lead_marks[extra_bytes]);

        dest += extra_bytes + 1;
        len += extra_bytes + 1;
    }

    *dest = '\0';
    return len;
}
// Return the value (0..15) of one hexadecimal digit, or -1 if ch is not one.
static int useq_hex_digit(char ch)
{
    if (ch >= '0' && ch <= '9')
    {
        return ch - '0';
    }
    if (ch >= 'a' && ch <= 'f')
    {
        return ch - 'a' + 10;
    }
    if (ch >= 'A' && ch <= 'F')
    {
        return ch - 'A' + 10;
    }
    return -1;
}

// Convert a run of "\uhhhh" escapes at the start of src into UTF-16 code
// units.
//
// src  - NUL-terminated text; conversion starts at its first character.
// dest - caller-provided output buffer; it is NOT bounds-checked. It receives
//        one wchar_t per escape plus a terminating NUL.
//
// Conversion stops at the first position that is not a complete escape: a
// backslash, a 'u', then exactly four hexadecimal digits. A malformed or
// truncated escape is not converted and ends the run. Each escape becomes
// exactly one code unit; surrogate halves are passed through unchanged.
//
// Returns the number of code units written (terminator excluded).
size_t convert_useq_to_UTF16(const char *src, wchar_t *dest)
{
    const size_t len = strlen(src);
    size_t pos = 0;
    size_t size = 0;

    // A full escape occupies six characters: '\', 'u' and four hex digits.
    while (len - pos >= 6 && src[pos] == '\\' && src[pos + 1] == 'u')
    {
        unsigned long value = 0;
        size_t j;

        for (j = 0; j < 4; j++)
        {
            int digit = useq_hex_digit(src[pos + 2 + j]);

            if (digit < 0)
            {
                break;
            }
            value = (value << 4) | (unsigned long)digit;
        }
        if (j < 4)
        {
            // Fewer than four hex digits: stop rather than emit a unit
            // assembled from a partial escape.
            break;
        }

        dest[size++] = (wchar_t)value;
        pos += 6;
    }

    dest[size] = '\0';
    return size;
}