话题2512093的标题是: gb2312->utf-8或big5或其他编码,如何转换? (20分)
分类:数据库-C/S型 xx-xzh (2004-03-20 11:09:00)
不同编码如何实现互相转换?[8D]
低调一贱男 (2004-03-20 11:19:01)
我现在作java,经常接触utf8.呵呵,做了一个.给你,不过好像没有注释
unit main;
interface
uses
Windows, Messages, SysUtils, Variants, Classes, Graphics, Controls, Forms,
Dialogs, ExtCtrls, StdCtrls, Buttons;
type
{ Main form of the converter: text is read from the GB memo, converted
  according to the entry selected in ComboBox1, and written to the Unicode memo. }
TUnicode2GB = class(TForm)
GB: TMemo; // source text; read by BitBtn2Click
Panel1: TPanel;
SetHeight: TSplitter;
Unicode: TMemo; // result text; cleared and refilled by BitBtn2Click
BitBtn1: TBitBtn;
BitBtn2: TBitBtn;
ComboBox1: TComboBox; // selected item is compared with 'UTF8' / 'Unicode' in BitBtn2Click
Label1: TLabel;
// Closes the form.
procedure BitBtn1Click(Sender: TObject);
// Runs the selected conversion on the contents of the GB memo.
procedure BitBtn2Click(Sender: TObject);
private
{ Private declarations }
public
{ Public declarations }
end;
var
Unicode2GB: TUnicode2GB;
function AnsiToUnicode(Ansi: string): string;
function Unicode2AscII(s: string): string;
function AnsiToUTF8(Ansi: string): string;
implementation
{$R *.dfm}
{ Click handler for BitBtn1: closes this form. Sender is not used. }
procedure TUnicode2GB.BitBtn1Click(Sender: TObject);
begin
  Self.Close;
end;
{ Click handler for BitBtn2.
  Reads the text of the GB memo, converts it with the function matching the
  entry selected in ComboBox1 ('UTF8' or 'Unicode'), and writes the result,
  followed by the original text, into the Unicode memo. }
procedure TUnicode2GB.BitBtn2Click(Sender: TObject);
var
  Source, Mode: string;
begin
  Source := GB.Lines.Text;

  // ItemIndex is -1 while nothing is selected; Items[-1] would raise a
  // "List index out of bounds" exception, so treat that case as "no mode".
  if ComboBox1.ItemIndex >= 0 then
    Mode := ComboBox1.Items[ComboBox1.ItemIndex]
  else
    Mode := '';

  Unicode.Lines.Clear;
  if Mode = 'UTF8' then
    Unicode.Lines.Append(AnsiToUTF8(Source))
  else if Mode = 'Unicode' then
    Unicode.Lines.Append(AnsiToUnicode(Source));

  // The source text is always echoed below the converted form.
  Unicode.Lines.Append('GB2312码: ' + Source);
end;
{ Converts an ANSI string into a sequence of escapes, one per UTF-16 code
  unit, each written as a backslash-u prefix followed by four upper-case hex
  digits (high byte first).

  NOTE: despite its name this function does not produce UTF-8 bytes; it
  produces Java-style Unicode escapes.

  Ansi    - text in the system ANSI code page.
  Returns - the escaped text; an empty string for empty input.

  Fixes over the posted version: the array subscripts that the forum
  software stripped are no longer needed, the escape prefix uses a backslash
  (the posting had turned it into a forward slash), and the input is no
  longer limited by a fixed 1000-byte buffer. }
function AnsiToUTF8(Ansi: string): string;
var
  Wide: WideString;
  i: Integer;
begin
  Result := '';
  // Assigning to a WideString converts from the ANSI code page to UTF-16.
  Wide := Ansi;
  for i := 1 to Length(Wide) do
    Result := Result + '\u' + IntToHex(Ord(Wide[i]), 4);
end;
{ Converts an ANSI string into a sequence of escapes, one per UTF-16 code
  unit. Each escape is a backslash-u prefix followed by the DECIMAL value of
  the high byte and then the DECIMAL value of the low byte, with no padding
  or separator (this formatting is kept from the posted version).

  NOTE(review): because the two decimal numbers are concatenated without a
  fixed width, the output cannot always be decoded unambiguously - confirm
  whether a fixed-width format was intended.

  Ansi    - text in the system ANSI code page.
  Returns - the escaped text; an empty string for empty input.

  Fixes over the posted version: the array subscripts that the forum
  software stripped are no longer needed, the escape prefix uses a backslash
  (the posting had turned it into a forward slash), and the input is no
  longer limited by a fixed 1000-byte buffer. }
function AnsiToUnicode(Ansi: string): string;
var
  Wide: WideString;
  i, CodeUnit: Integer;
begin
  Result := '';
  // Assigning to a WideString converts from the ANSI code page to UTF-16.
  Wide := Ansi;
  for i := 1 to Length(Wide) do
  begin
    CodeUnit := Ord(Wide[i]);
    Result := Result + '\u' + IntToStr(CodeUnit shr 8) + IntToStr(CodeUnit and $FF);
  end;
end;
end.
xx-xzh (2004-03-20 17:43:28)
感谢 低调一贱男!
另:还有 7-bit ASCII串
Binary
UCS2
GB2312
UDHI Indicator
最好能告诉原理,不然总麻烦你也不好,也总不会!
走来走去 (2004-03-20 22:57:27)
C++Builder的例子
==================================
1、 英文编码
缺省的GSM字符集为7位编码,ASCII码为8位编码,编码就是将8位ASCII编码转换为7位编码。
例如:1234 编码后得到31D98C06
2进制表示
8位编码 00110001 00110010 00110011 00110100
7位编码 00110001 11011001 10001100 00000110
通过例子可以看出,将ascii8位编码的Bit8去掉,依次将下7位编码的后几位逐次移到前面,形成新的8位编码。
以下是C++Builder的实现代码:
// Packs a string of 7-bit characters into GSM-style 7-bit packed octets and
// returns the octets as upper-case hex text (two digits per octet).
// Example: "1234" -> "31D98C06".
//
// InputStr - text whose characters are expected to fit in 7 bits; bit 8 of
//            every character is discarded before packing.
// Returns  - hex string of the packed octets; empty for empty input.
String __stdcall EncodeEnglish(String InputStr)
{
    const int len = InputStr.Length();
    String packed;
    int shift = 0;                       // bits of the current septet already emitted

    for (int i = 1; i <= len; i++)
    {
        // Index the string directly instead of strcpy-ing one-character
        // substrings into unsigned char buffers (a char*/unsigned char* mismatch).
        const unsigned int current = static_cast<unsigned char>(InputStr[i]) & 0x7f;
        unsigned int octet = current >> shift;

        if (i < len)
        {
            // Fill the free high bits with the low bits of the next septet.
            const unsigned int following = static_cast<unsigned char>(InputStr[i + 1]) & 0x7f;
            octet |= (following << (7 - shift)) & 0xff;
        }

        packed += IntToHex(static_cast<int>(octet), 2);

        shift = (shift + 1) % 7;
        if (shift == 0)
            i++;                         // the 8th septet was consumed entirely by this octet
    }
    return packed;
}
2、 英文解码
简单地说就是将7位字符编码转换为8位字符编码
以下是C++Builder的实现代码:
// Maps a bit position 0..7 to the mask that keeps the low (7 - Value) bits:
// 0 -> 0x7f, 1 -> 0x3f, ... 6 -> 0x01, 7 -> 0x00.
// Any other argument is returned unchanged.
int ReturnHex(int Value)
{
    static const int kMasks[8] = { 0x7f, 0x3f, 0x1f, 0x0f, 0x07, 0x03, 0x01, 0x00 };

    if (Value < 0 || Value > 7)
        return Value;
    return kMasks[Value];
}
// Unpacks GSM-style 7-bit packed octets (given as hex text, two digits per
// octet) back into one character per septet.
// Example: "31D98C06" -> "1234".
//
// InputStr - hex string of packed octets; a trailing odd hex digit is ignored.
// Returns  - the unpacked text. When the last octet completes a group of
//            seven and the extra 8th septet is 0, it is taken to be padding
//            and dropped (the septet count is not available to this function).
// Raises   - EConvertError (from StrToInt) when InputStr contains non-hex text.
//
// Fixes over the posted version: no fixed 300-byte buffers (and no 301-byte
// memset over them), the lost output subscript is gone, and the hard-coded
// patch of output position 12 is removed.
String __stdcall DecodeEnglish (String InputStr)
{
    const int octetCount = InputStr.Length() / 2;
    String text;
    unsigned int carry = 0;              // high bits of the previous octet not yet emitted
    int carryBits = 0;                   // how many bits 'carry' holds (0..6)

    for (int k = 0; k < octetCount; k++)
    {
        const unsigned int octet =
            static_cast<unsigned int>(StrToInt("0x" + InputStr.SubString(2 * k + 1, 2))) & 0xff;

        // Low bits of this octet go above the carried bits to form one septet.
        const unsigned int septet = ((octet << carryBits) | carry) & 0x7f;
        text += AnsiString(static_cast<char>(septet));

        carry = octet >> (7 - carryBits);
        carryBits++;

        if (carryBits == 7)
        {
            // Seven octets carry eight septets: the carry is now a full septet.
            const bool isLastOctet = (k == octetCount - 1);
            if (carry != 0 || !isLastOctet)
                text += AnsiString(static_cast<char>(carry & 0x7f));
            carry = 0;
            carryBits = 0;
        }
    }
    return text;
}
3、 中文编码
中文编码较为简单,就是将GB2312(代码页CP936)的中文编码转换为Unicode(UCS2)编码,再以每字符4位十六进制数输出即可
以下是C++Builder的实现代码
// Converts the input to a wide string and returns every wide character as
// four upper-case hex digits, concatenated in order.
String EncodeChinese(String InputStr)
{
    WideString wide = WideString(InputStr);
    const int count = wide.Length();
    String hexText;
    String oneUnit;

    for (int pos = 1; pos <= count; ++pos)
    {
        // Same character the original obtained via SubString(pos, 1) + wcscpy.
        int codeUnit = wide.c_bstr()[pos - 1];
        FmtStr(oneUnit, "%4.4X", ARRAYOFCONST((codeUnit)));
        hexText = hexText + oneUnit;
    }
    return hexText;
}
4、 中文解码
将每字符4位十六进制数表示的Unicode(UCS2)编码转换为GB2312(代码页CP936)的中文编码即可
以下是C++Builder的实现代码
// Converts hex text holding one UCS-2 code unit per four hex digits back
// into an ANSI string.
//
// InputStr - hex string; trailing digits that do not form a full group of
//            four are ignored.
// Returns  - the decoded text; decoding stops at a 0000 group, matching the
//            NUL-terminated behaviour of the posted version.
// Raises   - EConvertError (from StrToInt) when a group is not valid hex.
//
// Fix over the posted version: the fixed wchar_t Buf[300] overflowed for
// inputs longer than 1196 hex digits; the buffer is now sized from the input.
String DecodeChinese(String InputStr)
{
    const int count = InputStr.Length() / 4;
    if (count == 0)
        return String();

    WideString wide;
    wide.SetLength(count);
    for (int k = 0; k < count; k++)
    {
        wide[k + 1] = static_cast<wchar_t>(StrToInt("0x" + InputStr.SubString(4 * k + 1, 4)));
    }
    // WideCharToString reads up to the first NUL, like the original C buffer did.
    return WideCharToString(wide.c_bstr());
}
走来走去 (2004-03-20 22:58:24)
unix 下的c
========================
说明:函数名称
int strUnicode2GB(char *strSourcer, char *strDest,int n)
转换Unicode字符串到GB码,返回汉字数
输入( Unicode 源串,GB2312/ASCII混合编码串,Unicode字节数 必须是偶数!!!)
#include <sys/types.h>
const unsigned short int Unicode_GB2312[][2] =
{
/*Unicode ,GB||ASCII ,MEMO*/
0x0000,0x0040,/* 0 COMMERCIAL AT @ */
0x0001,0x00A3,/* 1 POUND SIGN £ */
0x0002,0x0024,/* 2 DOLLAR SIGN $ */
0x0003,0x00A5,/* 3 YEN SIGN ¥ */
0x0004,0x00E8,/* 4 LATIN SMALL LETTER E WITH GRAVE è */
.............
.............
码表太长了 请尽量通过 Google 查询,多的是
或者通过 stevenkoh@sohu.com 向我索取
0xFF5C,0xA3FC,/* '|' -> 65372 */
0xFF5D,0xA3FD,/* '}' -> 65373 */
0xFF5E,0xA1AB,/* '~' -> 65374 */
0xFFE0,0xA1E9,/* '¢' -> 65504 */
0xFFE1,0xA1EA,/* '£' -> 65505 */
0xFFE3,0xA3FE,/* ' ̄' -> 65507 */
0xFFE5,0xA3A4,/* '¥' -> 65509 */
};
/*
 * Map one 16-bit source code to its GB2312/ASCII code.
 *
 * A few codes are special-cased, printable ASCII letters/digits/punctuation
 * in 0x20..0x5A and 0x61..0x7A pass through unchanged, and everything else
 * is looked up in Unicode_GB2312 (column 0 = source code, column 1 = result).
 *
 * Returns the mapped code, or 0 when no mapping exists.
 *
 * NOTE(review): the lookup is a binary search and therefore assumes the
 * table is sorted ascending by column 0, as the posted halving search also
 * did - confirm against the full table.
 *
 * Fixes over the posted version: the row subscripts stripped by the forum
 * software are restored, and the search can no longer index before the
 * first or after the last table row.
 */
u_int16_t Unicode2GBcode(u_int16_t iUnicode)
{
    int lo, hi, mid;

    switch (iUnicode){
    case 0x0002:
        return 0x24;
    case 0x000a:
        return 0xa;
    case 0x000d:
        return 0xd;
    case 0x0040:
        return 0xA1;
    }
    if ((iUnicode>=0x20&&iUnicode<=0x5a)||(iUnicode>=0x61&&iUnicode<=0x7a)) return iUnicode;

    lo = 0;
    hi = (int)(sizeof(Unicode_GB2312)/sizeof(Unicode_GB2312[0])) - 1;
    while (lo <= hi){
        mid = lo + (hi - lo) / 2;
        if (Unicode_GB2312[mid][0] == iUnicode) return Unicode_GB2312[mid][1];
        if (Unicode_GB2312[mid][0] < iUnicode) lo = mid + 1;
        else hi = mid - 1;
    }
    return 0; /* no mapping found */
}
/*
 * Convert a big-endian 16-bit string to a mixed GB2312/ASCII string.
 *
 * strSourcer - source bytes, two per character, high byte first.
 * strDest    - output buffer; it is written to even though the parameter is
 *              declared const (signature kept for existing callers). It must
 *              hold at least n + 1 bytes.
 * n          - number of source bytes; a trailing odd byte is ignored.
 *
 * Returns the number of source characters processed (every character counts,
 * not only Chinese ones). Characters without a mapping, and mapped values in
 * 0x80..0xFE, are written as '.'.
 *
 * Fixes over the posted version: the terminator is a real NUL (the posted
 * multi-character constant stored the character '0'), source bytes are read
 * as unsigned so high bytes >= 0x80 are not sign-extended, and an odd n no
 * longer reads one byte past the input.
 */
int strUnicode2GB(const char *strSourcer,const char *strDest,int n)
{
    const unsigned char *pSrc = (const unsigned char *)strSourcer;
    char *pDest = (char *)strDest;
    u_int16_t hz, gb;
    int i = 0;

    for (; n > 1; n -= 2, pSrc += 2, ++i){
        hz = (u_int16_t)((pSrc[0] << 8) | pSrc[1]);
        gb = Unicode2GBcode(hz);
        if (!gb || (gb > 0x7F && gb < 0xFF)){
            *pDest++ = '.';
        }
        else if (gb <= 0x7F){
            *pDest++ = (char)gb;
        }
        else{
            /* double-byte result: high byte first, then low byte */
            *pDest++ = (char)(gb >> 8);
            *pDest++ = (char)(gb & 0xFF);
        }
    }
    *pDest = '\0';
    return i;
}
走来走去 (2004-03-20 22:59:03)
C#.Net 里面实现字符的编码和解码——
#region 生成字符(含英文)引用码
/// <summary>
/// Builds a WAP string of hexadecimal character references (an ampersand,
/// "#x", the hex code and a semicolon) with one reference per UTF-16 code
/// unit of the input.
/// </summary>
/// <param name="s_Chinese">Chinese text, may contain English characters.</param>
/// <returns>The concatenated character references; empty for null or empty input.</returns>
protected string Gen_Unicode(string s_Chinese)
{
    if (string.IsNullOrEmpty(s_Chinese))
    {
        return "";
    }
    System.Text.StringBuilder refs = new System.Text.StringBuilder(s_Chinese.Length * 8);
    foreach (char c in s_Chinese)
    {
        // Cast to int, not short: code units >= 0x8000 do not fit a signed
        // short and would overflow in a checked context.
        refs.Append("&#x").Append(((int)c).ToString("X")).Append(";");
    }
    return refs.ToString();
}
/// <summary>
/// Converts WAP character codes back to text (Chinese, may contain English).
/// The input is a list of hexadecimal codes separated by ';'. Each code may
/// carry the character-reference prefix (ampersand followed by "#x") or be
/// bare hex digits. Empty items are skipped.
/// </summary>
/// <param name="s_Unicode">';'-separated hex codes.</param>
/// <returns>The decoded text; empty for null or empty input.</returns>
/// <exception cref="System.FormatException">An item is not valid hexadecimal.</exception>
protected string Gen_Chinese(string s_Unicode)
{
    if (string.IsNullOrEmpty(s_Unicode))
    {
        return "";
    }
    System.Text.StringBuilder text = new System.Text.StringBuilder();
    foreach (string item in s_Unicode.Split(';'))
    {
        string hex = item.Trim();
        // Accept the reference form as well as bare hex; the posted version
        // failed on the prefix with a FormatException.
        if (hex.StartsWith("&#x", System.StringComparison.OrdinalIgnoreCase))
        {
            hex = hex.Substring(3);
        }
        if (hex == "")
        {
            continue;
        }
        // As before, only the first four hex digits of an item are used.
        if (hex.Length > 4)
        {
            hex = hex.Substring(0, 4);
        }
        text.Append((char)Convert.ToInt32(hex, 16));
    }
    return text.ToString();
}
/// <summary>
/// Converts text to hex, four upper-case hex digits per UTF-16 code unit.
/// </summary>
/// <param name="s_Chinese">Chinese text, may contain English characters.</param>
/// <returns>The concatenated hex digits; empty for null or empty input.</returns>
protected string Gen_Hex(string s_Chinese)
{
    if (string.IsNullOrEmpty(s_Chinese))
    {
        return "";
    }
    System.Text.StringBuilder hex = new System.Text.StringBuilder(s_Chinese.Length * 4);
    foreach (char c in s_Chinese)
    {
        // "X4" always yields four digits. The posted version padded only
        // two-digit values, so one- and three-digit values (e.g. U+000A,
        // U+0100) broke the fixed four-digit grouping.
        hex.Append(((int)c).ToString("X4"));
    }
    return hex.ToString();
}
/// <summary>
/// Converts hex text (four hex digits per UTF-16 code unit, high byte first)
/// back to a string (Chinese, may contain English).
/// </summary>
/// <param name="code">Hex digits; the length must be a multiple of four.</param>
/// <returns>The decoded text.</returns>
/// <exception cref="System.ArgumentOutOfRangeException">
/// The length of <paramref name="code"/> is not a multiple of four.
/// </exception>
/// <exception cref="System.FormatException">A digit pair is not valid hexadecimal.</exception>
protected string Gen_CharFromCode(string code)
{
    if (code.Length % 4 != 0)
    {
        throw new System.ArgumentOutOfRangeException("code", "Length must be a multiple of 4 hex digits.");
    }
    byte[] raw = new byte[code.Length / 2];
    for (int i = 0; i < raw.Length; i++)
    {
        raw[i] = (byte)Convert.ToInt32(code.Substring(i * 2, 2), 16);
    }
    // Decode all bytes in one call: decoding each code unit separately, as
    // the posted version did, turns both halves of a surrogate pair into U+FFFD.
    return System.Text.Encoding.BigEndianUnicode.GetString(raw);
}
#endregion
走来走去 (2004-03-20 22:59:29)
c++的编码、解码
#include <string.h>
#include <stdio.h>
#define BYTE unsigned char
/*
 * Pack len 7-bit characters from p into 8-bit octets in q.
 *
 * Every octet holds the remaining bits of one character plus the low bits
 * of the next; every 8th character is absorbed completely by the previous
 * octet, so len characters produce len - len/8 octets. Bit 8 of each input
 * character is discarded.
 *
 * q must have room for the packed octets plus one trailing 0 byte. The
 * trailing 0 is a convenience only: packed data may itself contain 0 bytes,
 * so callers should compute the packed length rather than use strlen.
 */
void code( BYTE *p, int len, BYTE *q )
{
    int i,j;
    for( i=j=0; i<len; i++ )
    {
        int shift = i%8;
        BYTE next;
        if( shift == 7 )
            continue;
        /* i+1<len, not i<len: never read the byte after the input */
        next = (i+1<len) ? p[i+1] : 0;
        q[j++] = (BYTE)(((p[i]&0x7f)>>shift) + ((next&((1<<(shift+1))-1))<<(7-shift)));
    }
    q[j]=0;
}
/*
 * Unpack len packed octets from p into one 7-bit character per byte in q.
 *
 * Each octet contributes its low bits to the current character and its high
 * bits to the next one; after every 7th octet the carried bits form a full
 * extra character. q is terminated with a 0 byte and must have room for
 * len + len/7 + 1 bytes.
 */
void dcode( BYTE *p, int len, BYTE *q )
{
    int i, j;
    BYTE carry = 0;                 /* high bits left over from the previous octet */
    for(i=j=0;i<len;i++)
    {
        int k = i%7;
        q[j++] = (BYTE)(((p[i]&((1<<(7-k))-1))<<k)+carry);
        carry = (BYTE)(p[i]>>(7-k));
        if( k == 6 )
        {
            /* seven octets carry eight characters */
            q[j++] = carry;
            carry = 0;
        }
    }
    q[j] = 0;
}
/*
 * Round-trip demo: packs a sample string with code(), unpacks it with
 * dcode() and prints the result, which should equal the sample.
 */
int main()
{
    char p[] = "234312l;asdkfpqwrjasdfjqwjrlashfnoqwerhkasdfqwerqwerqwerqwer";
    char q[256];
    char r[256];
    int plainLen = (int)strlen(p);
    /* Every 8th character is absorbed by the previous octet. The packed data
       is binary and may contain 0 bytes, so its length is computed, not
       measured with strlen. */
    int packedLen = plainLen - plainLen / 8;

    code( (BYTE *)p, plainLen, (BYTE *)q );
    dcode( (BYTE *)q, packedLen, (BYTE *)r );
    printf( "%s\n", r );
    return 0;
}
走来走去 (2004-03-20 23:00:24)
delphi解码
=================
function UnicodeToString(var AString: string; AUnicode: PChar; ALenth: integer): integer;
{**********************************
 Purpose: convert network-order (big-endian) 16-bit Unicode to an ANSI string.
 AString:  receives the converted string.
 AUnicode: input bytes exactly as received; the high/low byte swap is done
           here, so the caller must not swap them first.
 ALenth:   number of input bytes.
 Result:   number of 16-bit characters converted; 0 when ALenth <= 0 (AString
           is then left untouched) or when the conversion fails (AString is
           then set to '').

 Fixes over the posted version: the input is no longer copied unchecked into
 a fixed 1024-byte buffer, the caller's buffer is no longer byte-swapped in
 place, and the converted data is properly terminated (the posted code put
 the terminator into a buffer it never read).
***********************************}
var
  Wide: WideString;
  Bytes: PAnsiChar;
  CharCount, i: Integer;
begin
  Result := 0;
  if ALenth <= 0 then
    Exit;
  try
    AString := '';
    CharCount := ALenth div 2;
    if (AUnicode = nil) or (CharCount = 0) then
      Exit;
    Bytes := PAnsiChar(AUnicode);
    SetLength(Wide, CharCount);
    for i := 0 to CharCount - 1 do
      // High byte comes first on the wire.
      Wide[i + 1] := WideChar((Ord(Bytes[2 * i]) shl 8) or Ord(Bytes[2 * i + 1]));
    // Converts up to the first NUL character, as before.
    AString := WideCharToString(PWideChar(Wide));
    Result := CharCount;
  except
    AString := '';
    Result := 0;
  end;
end;
走来走去 (2004-03-20 23:01:00)
java解码
==============
/*
* UnicodeTest.java
*
* Created on July 29, 2003, 12:59 PM
*/
/**
*
* @author abc
* @version
*/
/**
 * Small demonstration of converting a string to and from several encodings
 * (GBK, big-endian UCS-2 without byte-order mark, ASCII) and printing the
 * encoded bytes as hex.
 */
public class UnicodeTest
{
    public static void main(String args[])
    {
        UnicodeTest UT = new UnicodeTest();
        UT.test1();
    }

    /**
     * Encodes a sample string with each encoding, prints the bytes, and for
     * GBK and UCS-2 decodes them again and prints the round-tripped string.
     */
    public void test1()
    {
        String str = "测试信息abc123";
        try
        {
            byte[] b = str.getBytes("GBK");
            System.out.println(str + " -(GBK)编码: " + bytesToHexStr(b));
            System.out.println("");
            str = new String(b, "GBK");
            System.out.println("从GBK编码 " + bytesToHexStr(b) + " 重新转换为字串: " + str);
            System.out.println("");
            b = str.getBytes("UnicodeBigUnmarked");
            System.out.println(str + " -(UCS2)编码: " + bytesToHexStr(b));
            System.out.println("");
            str = new String(b, "UnicodeBigUnmarked");
            System.out.println("从(UCS2)编码 " + bytesToHexStr(b) + " 重新转换为字串: " + str);
            System.out.println("");
            b = str.getBytes("ASCII");
            System.out.println(str + " -(ASCII)编码: " + bytesToHexStr(b));
            System.out.println("");
        }
        catch(Exception e)
        {
            // An unsupported charset name used to be swallowed silently,
            // which left the output truncated without explanation.
            e.printStackTrace();
        }
    }

    /**
     * Formats bytes as lower-case hex, each value followed by one space.
     * Values below 0x10 are printed as a single digit.
     *
     * @param b bytes to format; may be null
     * @return the hex text, or an empty string when b is null
     */
    private String bytesToHexStr(byte[] b)
    {
        if (b == null) return "";
        StringBuffer strBuffer = new StringBuffer(b.length * 3);
        for(int i = 0; i < b.length; i++)
        {
            // Mask to 0..255 so negative bytes are not printed as ffffffxx.
            strBuffer.append(Integer.toHexString(b[i] & 0xff));
            strBuffer.append(" ");
        }
        return strBuffer.toString();
    }
}
运行此小程序的输出结果是:
测试信息abc123 -(GBK)编码: b2 e2 ca d4 d0 c5 cf a2 61 62 63 31 32 33
从GBK编码 b2 e2 ca d4 d0 c5 cf a2 61 62 63 31 32 33 重新转换为字串: 测试信息abc123
测试信息abc123 -(UCS2)编码: 6d 4b 8b d5 4f e1 60 6f 0 61 0 62 0 63 0 31 0 32 0 33
从(UCS2)编码 6d 4b 8b d5 4f e1 60 6f 0 61 0 62 0 63 0 31 0 32 0 33 重新转换为字串: 测试信息abc123
测试信息abc123 -(ASCII)编码: 3f 3f 3f 3f 61 62 63 31 32 33