Chinaunix首页 | 论坛 | 博客
  • 博客访问: 186649
  • 博文数量: 79
  • 博客积分: 0
  • 博客等级: 民兵
  • 技术积分: 15
  • 用 户 组: 普通用户
  • 注册时间: 2016-07-25 14:40
文章分类
文章存档

2015年(1)

2014年(1)

2013年(2)

2012年(1)

2011年(5)

2010年(2)

2009年(6)

2008年(20)

2007年(27)

2006年(14)

我的朋友

分类: Python/Ruby

2007-11-29 20:59:28

    最近一直在学习和编写FAT32文件系统的代码,读到长文件名时发现微软的这个长文件名支持真是要命:普通的8.3文件名用的是系统默认的编码,比如中文Windows是cp936,但长文件名却用了Unicode。于是为了处理目录文件项目,你非得做一个cp936与Unicode的对照表不可。我的嵌入式系统不得不额外增加了100多K的大小。
    下面的这个python程序我用来生成一个gb2312与unicode的汉字对照表,gb2312没有的unicode汉字我都用“口”代替了。

#-*- coding: gb2312 -*-
"""GB2312 <-> Unicode table generator.

Writes a C header fragment to stdout containing two ``U16`` arrays:

* ``gk2un`` -- one entry per (lead, trail) byte pair with lead in
  0xB0..0xF7 and trail in 0xA1..0xFE, emitted row by row, so the entry for
  a pair sits at position ``(lead - 0xB0) * 94 + (trail - 0xA1)``.  The
  five pairs 0xD7FA..0xD7FE are filled with a placeholder character so
  that every row keeps 94 entries.
* ``un2gb`` -- one entry per code point from 0x4E00 up to the largest code
  point seen while building ``gk2un``.  Code points whose GBK encoding is
  outside the range above get the fixed value 0xe0ed.

The declared array sizes are computed from the number of entries actually
emitted.  The script runs under both Python 2 and Python 3; the output is
encoded as GBK so the character shown in each C comment keeps the same
bytes on either version.
"""

import struct
import sys

try:
    _unichr = unichr  # Python 2
except NameError:
    _unichr = chr  # Python 3

# Byte ranges covered by the generated tables.
LEAD_FIRST, LEAD_LAST = 0xB0, 0xF7
TRAIL_FIRST, TRAIL_LAST = 0xA1, 0xFE

# Pairs 0xD7FA..0xD7FE are not decoded; a placeholder is emitted instead.
GAP_LEAD = 0xD7
GAP_TRAIL_FIRST = 0xFA

# First code point of the un2gb table.
UNICODE_BASE = 0x4E00

# Placeholder character used for the five gap entries of gk2un.
BLANK_CHAR = u"\u53e3"


def build_gk2un():
    """Build the initializer rows of the ``gk2un`` table.

    Returns:
        A tuple ``(rows, count, biggest)`` where ``rows`` is the list of
        text lines (one per table entry, placeholders included), ``count``
        is the number of decoded characters (placeholders excluded) and
        ``biggest`` is the largest code point among them.
    """
    rows = []
    count = 0
    biggest = 0
    blank_code = ord(BLANK_CHAR)
    for lead in range(LEAD_FIRST, LEAD_LAST + 1):
        for trail in range(TRAIL_FIRST, TRAIL_LAST + 1):
            if lead == GAP_LEAD and trail >= GAP_TRAIL_FIRST:
                # Keep the row 94 entries long so index arithmetic works.
                rows.append(u"0x%x, //blank, : 0x%x%x"
                            % (blank_code, lead, trail))
                continue
            # "BB" packs the two bytes explicitly, so the result does not
            # depend on the byte order of the machine running the script.
            char = struct.pack("BB", lead, trail).decode("gbk")
            code = ord(char)
            if code > biggest:
                biggest = code
            rows.append(u"0x%x,  // %s : 0x%x%x" % (code, char, lead, trail))
            count += 1
    return rows, count, biggest


def build_un2gb(biggest):
    """Build the initializer rows of the ``un2gb`` table.

    Args:
        biggest: last code point (inclusive) to emit an entry for.

    Returns:
        A tuple ``(rows, count)`` where ``rows`` holds one text line per
        code point in ``UNICODE_BASE..biggest`` and ``count`` is how many
        of them fall inside the lead/trail byte ranges of ``gk2un``.

    Raises:
        UnicodeEncodeError: if a code point in the range has no GBK
            encoding.
    """
    rows = []
    count = 0
    for code in range(UNICODE_BASE, biggest + 1):
        char = _unichr(code)
        lead, trail = struct.unpack("BB", char.encode("gbk"))
        in_range = (LEAD_FIRST <= lead <= LEAD_LAST
                    and TRAIL_FIRST <= trail <= TRAIL_LAST)
        if in_range:
            rows.append(u"0x%x%x,  // %s : 0x%x" % (lead, trail, char, code))
            count += 1
        else:
            rows.append(u"0xe0ed,  // %s : 0x%x  not gb2312" % (char, code))
    return rows, count


def generate_lines():
    """Return every line of the generated C header fragment, in order."""
    gk2un_rows, gb_count, biggest = build_gk2un()
    un2gb_rows, un2gb_count = build_un2gb(biggest)

    lines = [u"//convering gk2312 to unicode",
             u"U16 gk2un[%d]={" % len(gk2un_rows)]
    lines.extend(gk2un_rows)
    lines.append(u"};")
    # Not the whole Unicode range, only enough to include every character
    # that appears in gk2un.
    lines.append(u"//conver unicode to gb2312")
    lines.append(u"U16 un2gb[%d]={" % len(un2gb_rows))
    lines.extend(un2gb_rows)
    lines.append(u"};")
    lines.append(u"//gb2312 count= %d  unicode count= %d un2gb count= %d"
                 % (gb_count, len(un2gb_rows), un2gb_count))
    return lines


def main():
    """Write the generated header fragment to stdout as GBK-encoded bytes."""
    text = u"\n".join(generate_lines()) + u"\n"
    # Python 3 exposes the byte stream as sys.stdout.buffer; Python 2's
    # sys.stdout accepts byte strings directly.
    stream = getattr(sys.stdout, "buffer", sys.stdout)
    stream.write(text.encode("gbk"))


if __name__ == "__main__":
    main()

阅读(4549) | 评论(0) | 转发(0) |
0

上一篇:摘抄

下一篇:程序员《大腕》版

给主人留下些什么吧!~~