簡體中文有兩種常用編碼:GB2312 和 Unicode。它們的對應關係可以用
下面的程式生成:
/**
 * GB2312Unicde.java
 * Copyright (c) 2003 by Dr. Herong Yang, http://www.herongyang.com/
 */
import java.io.*;
import java.nio.*;
import java.nio.charset.*;

/**
 * Writes a table to the file "gb2312.gb" listing, for every row/column
 * position from 01/01 to 94/94, the two raw bytes (0xA0 + row, 0xA0 + column),
 * their hexadecimal form, and the UTF-16BE and UTF-8 encodings of the
 * character obtained by decoding those bytes as GBK.
 *
 * All output goes through the static {@link #out} stream as raw bytes; the
 * file therefore mixes ASCII markup with the raw two-byte codes.
 */
class GB2312Unicde {

    /** Destination stream for every write helper; assigned by {@link #main}. */
    static OutputStream out = null;

    /** Upper-case hexadecimal digits, indexed by nibble value. */
    static char hexDigit[] = {'0', '1', '2', '3', '4', '5', '6', '7',
                              '8', '9', 'A', 'B', 'C', 'D', 'E', 'F'};

    /**
     * Inclusive lower bounds, expressed as row*100+column, of the ranges that
     * {@link #validGB} rejects (presumably positions with no assigned
     * character -- confirm against the GB2312 table).
     */
    static int b_out[] = {201, 267, 279, 293, 484, 587, 625, 657, 734, 782, 827,
                          874, 901, 980, 5590};

    /** Inclusive upper bounds matching {@link #b_out} index by index. */
    static int e_out[] = {216, 268, 280, 294, 494, 594, 632, 694, 748, 794, 836,
                          894, 903, 994, 5594};

    /**
     * Creates "gb2312.gb" in the working directory and writes the table.
     * Any {@link IOException} is printed to standard output.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        try {
            out = new FileOutputStream("gb2312.gb");
            try {
                writeCode();
            } finally {
                // Close even when writeCode() fails so the file handle is not leaked.
                out.close();
            }
        } catch (IOException e) {
            System.out.println(e.toString());
        }
    }

    /**
     * Writes one titled section per row (1..94). Rows 10-15 and 88-94 are
     * labelled "Reserved" and get a title only; every other row gets a
     * two-entries-per-line table of its 94 columns.
     *
     * @throws IOException if writing to {@link #out} fails
     */
    public static void writeCode() throws IOException {
        boolean reserved = false;
        String name = null;
        // GB2312 is not supported by JDK. So I am using GBK.
        CharsetDecoder gbdc = Charset.forName("GBK").newDecoder();
        CharsetEncoder uxec = Charset.forName("UTF-16BE").newEncoder();
        CharsetEncoder u8ec = Charset.forName("UTF-8").newEncoder();
        ByteBuffer gbbb = null;
        ByteBuffer uxbb = null;
        ByteBuffer u8bb = null;
        CharBuffer cb = null;
        int count = 0;

        for (int i = 1; i <= 94; i++) {
            // Defining row settings
            if (i >= 1 && i <= 9) {
                reserved = false;
                name = "Graphic symbols";
            } else if (i >= 10 && i <= 15) {
                reserved = true;
                name = "Reserved";
            } else if (i >= 16 && i <= 55) {
                reserved = false;
                name = "Level 1 characters";
            } else if (i >= 56 && i <= 87) {
                reserved = false;
                name = "Level 2 characters";
            } else if (i >= 88 && i <= 94) {
                reserved = true;
                name = "Reserved";
            }

            // writing row title
            writeln();
            writeString("<p>");
            writeNumber(i);
            writeString(" Row: " + name);
            writeln();
            writeString("</p>");
            writeln();

            if (!reserved) {
                writeln();
                writeHeader();

                // looping through all characters in one row
                for (int j = 1; j <= 94; j++) {
                    byte hi = (byte) (0xA0 + i);
                    byte lo = (byte) (0xA0 + j);

                    if (validGB(i, j)) {
                        // getting GB, UTF-16BE, UTF-8 codes
                        gbbb = ByteBuffer.wrap(new byte[]{hi, lo});
                        try {
                            cb = gbdc.decode(gbbb);
                            uxbb = uxec.encode(cb);
                            // The first encode consumed the buffer; rewind before reusing it.
                            cb.rewind();
                            u8bb = u8ec.encode(cb);
                        } catch (CharacterCodingException e) {
                            // Undecodable/unencodable position: print it as "null" below.
                            cb = null;
                            uxbb = null;
                            u8bb = null;
                        }
                    } else {
                        cb = null;
                        uxbb = null;
                        u8bb = null;
                    }

                    writeNumber(i);
                    writeNumber(j);
                    writeString(" ");
                    if (cb != null) {
                        writeByte(hi);
                        writeByte(lo);
                        writeString(" ");
                        writeHex(hi);
                        writeHex(lo);
                        count++;
                    } else {
                        writeGBSpace();
                        writeString(" null");
                    }
                    writeString(" ");
                    writeByteBuffer(uxbb, 2);
                    writeString(" ");
                    writeByteBuffer(u8bb, 3);

                    // Two entries per output line.
                    if (j % 2 == 0) {
                        writeln();
                    } else {
                        writeString(" ");
                    }
                }
                writeFooter();
            }
        }
        System.out.println("Number of GB characters worte: " + count);
    }

    /**
     * Writes a CR LF line terminator.
     *
     * @throws IOException if writing to {@link #out} fails
     */
    public static void writeln() throws IOException {
        out.write(0x0D);
        out.write(0x0A);
    }

    /**
     * Writes a single raw byte.
     *
     * @param b the byte to write
     * @throws IOException if writing to {@link #out} fails
     */
    public static void writeByte(byte b) throws IOException {
        out.write(b & 0xFF);
    }

    /**
     * Writes the bytes of {@code b} from index 0 up to its limit as hex
     * digits, or the text "null" when {@code b} is {@code null}, then pads
     * with one space for each byte short of {@code l}. A {@code null} buffer
     * counts as two bytes for padding purposes.
     *
     * NOTE(review): each byte prints as two hex digits but padding emits a
     * single space per missing byte, so columns may not line up -- confirm
     * the intended pad width.
     *
     * @param b the buffer to print, or {@code null}
     * @param l the number of bytes the column is expected to hold
     * @throws IOException if writing to {@link #out} fails
     */
    public static void writeByteBuffer(ByteBuffer b, int l) throws IOException {
        int i = 0;
        if (b == null) {
            writeString("null");
            i = 2;
        } else {
            for (i = 0; i < b.limit(); i++) {
                writeHex(b.get(i));
            }
        }
        for (int j = i; j < l; j++) {
            writeString(" ");
        }
    }

    /**
     * Writes the two bytes 0xA1 0xA1 (used where no character is printed).
     *
     * @throws IOException if writing to {@link #out} fails
     */
    public static void writeGBSpace() throws IOException {
        out.write(0xA1);
        out.write(0xA1);
    }

    /**
     * Writes the low eight bits of every char in {@code s}; a {@code null}
     * string writes nothing.
     *
     * @param s the text to write, or {@code null}
     * @throws IOException if writing to {@link #out} fails
     */
    public static void writeString(String s) throws IOException {
        if (s != null) {
            for (int i = 0; i < s.length(); i++) {
                out.write((int) (s.charAt(i) & 0xFF));
            }
        }
    }

    /**
     * Writes the last two characters of "00" followed by the decimal form of
     * {@code i}, i.e. a zero-padded two-digit number for 0..99.
     *
     * @param i the number to write
     * @throws IOException if writing to {@link #out} fails
     */
    public static void writeNumber(int i) throws IOException {
        String s = "00" + String.valueOf(i);
        writeString(s.substring(s.length() - 2, s.length()));
    }

    /**
     * Writes {@code b} as two upper-case hexadecimal digits.
     *
     * @param b the byte to write
     * @throws IOException if writing to {@link #out} fails
     */
    public static void writeHex(byte b) throws IOException {
        // Masking after the shift discards the sign extension of negative bytes.
        out.write((int) hexDigit[(b >> 4) & 0x0F]);
        out.write((int) hexDigit[b & 0x0F]);
    }

    /**
     * Opens a {@code <pre>} section and writes the two-column heading line.
     *
     * @throws IOException if writing to {@link #out} fails
     */
    public static void writeHeader() throws IOException {
        writeString("<pre>");
        writeln();
        writeString("Q.W. ");
        writeGBSpace();
        writeString(" GB Uni. UTF-8 ");
        writeString(" ");
        writeString("Q.W. ");
        writeGBSpace();
        writeString(" GB Uni. UTF-8 ");
        writeln();
        writeln();
    }

    /**
     * Closes the {@code <pre>} section opened by {@link #writeHeader}.
     *
     * @throws IOException if writing to {@link #out} fails
     */
    public static void writeFooter() throws IOException {
        writeString("</pre>");
        writeln();
    }

    /**
     * Tells whether position (i, j) lies outside every excluded range.
     *
     * @param i the row number
     * @param j the column number
     * @return {@code false} when i*100+j falls inside any
     *         [{@code b_out[l]}, {@code e_out[l]}] range, {@code true} otherwise
     */
    public static boolean validGB(int i, int j) {
        int code = i * 100 + j;
        for (int l = 0; l < b_out.length; l++) {
            if (code >= b_out[l] && code <= e_out[l]) {
                return false;
            }
        }
        return true;
    }
}
程式輸出的例表格式如下:
Q.W. GB Uni. UTF-8 Q.W. GB Uni. UTF-8
1601 啊 B0A1 554A E5958A 1602 阿 B0A2 963F E998BF
1603 埃 B0A3 57C3 E59F83 1604 挨 B0A4 6328 E68CA8
1605 哎 B0A5 54CE E5938E 1606 唉 B0A6 5509 E59489
1607 哀 B0A7 54C0 E59380 1608 皑 B0A8 7691 E79A91
1609 癌 B0A9 764C E7998C 1610 蔼 B0AA 853C E894BC
1611 矮 B0AB 77EE E79FAE 1612 艾 B0AC 827E E889BE
1613 碍 B0AD 788D E7A28D 1614 爱 B0AE 7231 E788B1
1615 隘 B0AF 9698 E99A98 1616 鞍 B0B0 978D E99E8D
1617 氨 B0B1 6C28 E6B0A8 1618 安 B0B2 5B89 E5AE89
1619 俺 B0B3 4FFA E4BFBA 1620 按 B0B4 6309 E68C89
1621 暗 B0B5 6697 E69A97 1622 岸 B0B6 5CB8 E5B2B8
1623 胺 B0B7 80FA E883BA 1624 案 B0B8 6848 E6A188
1625 肮 B0B9 80AE E882AE 1626 昂 B0BA 6602 E69882
1627 盎 B0BB 76CE E79B8E 1628 凹 B0BC 51F9 E587B9
1629 敖 B0BD 6556 E69596 1630 熬 B0BE 71AC E786AC
1631 翱 B0BF 7FF1 E7BFB1 1632 袄 B0C0 8884 E8A284
1633 傲 B0C1 50B2 E582B2 1634 奥 B0C2 5965 E5A5A5
1635 懊 B0C3 61CA E6878A 1636 澳 B0C4 6FB3 E6BEB3
。。。