查看文章
 
UTF-8 与 UTF-16 相互转换及 \uhhhh 转换为 UTF-16 的 C++ 函数(上) 之函数篇
2008-07-17 11:27
////////////////////////////////////////////////////////////////////////////////
//
// ConvertUTF
//
// Description - Convert between UTF-8 & UTF-16, and convert \uhhhh to UTF-16
// Author - Li Ming Jie (Martin)
// Email -
// Created - 2008.03
// Licence - Free
// Copyright - Free
// Version -
// Changes -
// 2008.07.17 Add function convert_useq_to_UTF16 for converting \uhhhh
// sequence to UTF-16
//
////////////////////////////////////////////////////////////////////////////////

#include <stdio.h>
#include <string.h>
#include <wchar.h>

// UTF-8 lead-byte marker bits, indexed by the number of continuation bytes
// that follow the lead byte (0..3): 0xxxxxxx, 110xxxxx, 1110xxxx, 11110xxx.
unsigned long FirstByteMasks[4] = {0x00UL, 0xC0UL, 0xE0UL, 0xF0UL};

// Converts the NUL-terminated UTF-8 string src into UTF-16 code units stored
// in dest (one code unit per wchar_t, 0-terminated on success). Returns the
// number of code units written, terminator excluded, or 0 on invalid input.
size_t convert_UTF8_to_UTF16(const char *src, wchar_t *dest);
// Converts the 0-terminated UTF-16 string src (one code unit per wchar_t) into
// UTF-8 bytes stored in dest (NUL-terminated on success). Returns the number
// of bytes written, terminator excluded, or 0 on an unpaired surrogate.
size_t convert_UTF16_to_UTF8(const wchar_t *src, char *dest);
// Parses the run of backslash-u escapes (backslash, 'u', four hex digits) at
// the start of src into UTF-16 code units stored in dest, 0-terminated.
// Returns the number of code units stored.
size_t convert_useq_to_UTF16(const char *src, wchar_t *dest);

// Converts a NUL-terminated UTF-8 string into UTF-16 code units.
//
// src  - NUL-terminated UTF-8 input.
// dest - output buffer, one UTF-16 code unit per wchar_t. It is not bounds
//        checked; strlen(src) + 1 elements are always enough, because no
//        input byte produces more than one code unit.
//
// Returns the number of code units written, not counting the terminating 0.
// Returns 0 when the input is malformed: an invalid lead byte (including 5/6
// octet forms), a missing or invalid continuation byte, an overlong encoding,
// an encoded surrogate (U+D800..U+DFFF) or a value above U+10FFFF. On failure
// dest is left unterminated with partial content. An empty input also
// returns 0, with dest[0] set to 0.
size_t convert_UTF8_to_UTF16(const char *src, wchar_t *dest)
{
    // Smallest code point that genuinely needs N continuation bytes; anything
    // below it is an overlong encoding, which RFC 3629 forbids.
    static const unsigned long min_value[4] = {0x00UL, 0x80UL, 0x800UL, 0x10000UL};
    size_t len = 0;

    while (*src)
    {
        // Go through unsigned char so bytes >= 0x80 never sign-extend.
        unsigned long c = (unsigned long)(unsigned char)*src++;
        size_t extra_bytes;
        size_t i;

        if (c < 0x80UL) // 1 octet: plain ASCII
        {
            *dest++ = (wchar_t)c;
            len++;
            continue;
        }

        if ((c & 0xE0UL) == 0xC0UL) // 2 octets: 110xxxxx
        {
            c &= 0x1FUL;
            extra_bytes = 1;
        }
        else if ((c & 0xF0UL) == 0xE0UL) // 3 octets: 1110xxxx
        {
            c &= 0x0FUL;
            extra_bytes = 2;
        }
        else if ((c & 0xF8UL) == 0xF0UL) // 4 octets: 11110xxx
        {
            c &= 0x07UL;
            extra_bytes = 3;
        }
        else
        {
            // Stray continuation byte, or a 5/6 octet form that UTF-16
            // cannot represent.
            return 0;
        }

        for (i = 0; i < extra_bytes; i++)
        {
            unsigned long trail = (unsigned long)(unsigned char)*src;

            // Must look like 10xxxxxx. The terminating NUL fails this test
            // too, which catches a truncated sequence.
            if ((trail & 0xC0UL) != 0x80UL) return 0;

            c = (c << 6) | (trail & 0x3FUL);
            src++;
        }

        // Reject overlong forms such as C0 80 (NUL) or E0 80 AF ('/').
        if (c < min_value[extra_bytes]) return 0;

        if (c < 0x10000UL)
        {
            // U+D800..U+DFFF are reserved for UTF-16 surrogate pairs.
            if (c >= 0xD800UL && c <= 0xDFFFUL) return 0;

            *dest++ = (wchar_t)c;
            len++;
        }
        else
        {
            c -= 0x10000UL;

            // Original value was above U+10FFFF: not representable.
            if (c >= 0x100000UL) return 0;

            *dest++ = (wchar_t)(0xD800UL + (c >> 10));      // high surrogate
            *dest++ = (wchar_t)(0xDC00UL + (c & 0x3FFUL));  // low surrogate
            len += 2;
        }
    }

    *dest = (wchar_t)'\0';
    return len;
}

// Converts a 0-terminated UTF-16 string into UTF-8.
//
// src  - UTF-16 input, one code unit per wchar_t; only the low 16 bits of
//        each element are used.
// dest - output buffer, not bounds checked. Each input code unit produces at
//        most 3 bytes (a surrogate pair produces 4), plus the final NUL.
//
// Returns the number of bytes written, not counting the terminating NUL.
// Returns 0 when a high surrogate is not followed by a low surrogate or a low
// surrogate appears on its own; dest is then left unterminated with partial
// content. An empty input also returns 0, with dest[0] set to NUL.
size_t convert_UTF16_to_UTF8(const wchar_t *src, char *dest)
{
    // Marker bits OR-ed into the lead byte, indexed by the number of
    // continuation bytes that follow it.
    static const unsigned long lead_marks[4] = {0x00UL, 0xC0UL, 0xE0UL, 0xF0UL};
    size_t len = 0;

    while (*src)
    {
        unsigned long c = (unsigned long)*src++ & 0xFFFFUL;
        size_t extra_bytes;
        size_t i;

        // Combine a surrogate pair into a single code point.
        if (c >= 0xD800UL && c <= 0xDBFFUL)
        {
            // If the string ends here this reads the terminator (0), which
            // is not a low surrogate, so the pair is rejected below.
            unsigned long low = (unsigned long)*src & 0xFFFFUL;

            if (low < 0xDC00UL || low > 0xDFFFUL) return 0;
            src++;

            // Surrogates carry (code point - 0x10000) in 2 x 10 bits; the
            // offset must be added back to get the real code point.
            c = 0x10000UL + ((c & 0x03FFUL) << 10) + (low & 0x03FFUL);
        }
        else if (c >= 0xDC00UL && c <= 0xDFFFUL)
        {
            return 0; // low surrogate without a preceding high surrogate
        }

        // Number of continuation bytes needed for this code point.
        if (c < 0x80UL) extra_bytes = 0;
        else if (c < 0x800UL) extra_bytes = 1;
        else if (c < 0x10000UL) extra_bytes = 2;
        else extra_bytes = 3;

        // Fill continuation bytes from last to first, 6 payload bits each.
        for (i = extra_bytes; i > 0; i--)
        {
            dest[i] = (char)(0x80UL | (c & 0x3FUL));
            c >>= 6;
        }

        // The remaining high bits go into the lead byte.
        dest[0] = (char)(c | lead_marks[extra_bytes]);

        dest += extra_bytes + 1;
        len += extra_bytes + 1;
    }

    *dest = '\0';
    return len;
}

// Returns the value (0..15) of a hexadecimal digit, or -1 if ch is not one.
static int useq_hex_value(char ch)
{
    if (ch >= '0' && ch <= '9') return ch - '0';
    if (ch >= 'a' && ch <= 'f') return ch - 'a' + 10;
    if (ch >= 'A' && ch <= 'F') return ch - 'A' + 10;
    return -1;
}

// Parses consecutive backslash-u escapes (a backslash, the letter 'u' and
// exactly four hex digits) from the start of src into UTF-16 code units.
//
// src  - NUL-terminated input made of such escapes.
// dest - output buffer, not bounds checked; strlen(src) / 6 + 1 elements are
//        always enough.
//
// Parsing stops at the first position that does not hold a complete,
// well-formed escape; a malformed escape contributes nothing to the output.
// dest is always 0-terminated. Returns the number of code units stored.
size_t convert_useq_to_UTF16(const char *src, wchar_t *dest)
{
    size_t len = strlen(src);
    size_t pos = 0;
    size_t size = 0;

    // One escape occupies exactly 6 bytes: backslash, 'u', four hex digits.
    while (len - pos >= 6 && src[pos] == '\\' && src[pos + 1] == 'u')
    {
        unsigned long value = 0;
        size_t j;

        for (j = 0; j < 4; j++)
        {
            int digit = useq_hex_value(src[pos + 2 + j]);

            if (digit < 0) break;
            value = (value << 4) | (unsigned long)digit;
        }

        // Only commit the code unit once all four digits have been
        // validated, so a bad escape never leaks a partial value.
        if (j < 4) break;

        dest[size++] = (wchar_t)value;
        pos += 6;
    }

    dest[size] = '\0';
    return size;
}

类别:程序设计||添加到搜藏 |分享到i贴吧|浏览(1127)|评论 (0)
 
 
最近读者:
 
网友评论:
发表评论:
姓 名:
网址或邮箱: (选填)
内 容:
     

   
帮助中心 | 空间客服 | 投诉中心 | 空间协议
©2012 Baidu