分享

Python 汉字转拼音、五笔

 londonKu 2012-04-05

拼音方法一(较全)

下载库:Mandarin.dat

Python

# coding: utf-8
import io


class pinyin(object):
    """Look up toneless pinyin for Chinese characters in a TAB-separated table.

    Each line of the data file is read as ``<key> TAB <readings>``. Keys are
    matched against the upper-case hexadecimal code point of a character
    (``"%X" % ord(char)``). Only the first space-separated reading is used,
    and its last character is dropped (presumably a tone digit; confirm
    against the data file).
    """

    def __init__(self, data_path='./Mandarin.dat', encoding='utf-8'):
        """Load the lookup table.

        Args:
            data_path: Path of the TAB-separated data file.
            encoding: Text encoding of the data file. NOTE(review): UTF-8 is
                an assumption; pass another value if the file differs.
        """
        self.dict = {}
        # The context manager closes the file even if a line fails to decode.
        with io.open(data_path, encoding=encoding) as table:
            for line in table:
                code, tab, readings = line.partition('\t')
                if not tab:
                    # Blank or malformed line: nothing to map, skip it
                    # instead of aborting the whole load.
                    continue
                self.dict[code] = readings
        # String placed between the converted characters by pinyin().
        self.splitter = ''

    def pinyin(self, chars):
        """Return ``chars`` with every known character replaced by pinyin.

        Characters that have no entry in the table (or items that are not a
        single character) are copied to the output unchanged.

        Args:
            chars: Iterable of characters, normally a unicode string.

        Returns:
            The converted pieces joined with ``self.splitter``.
        """
        result = []
        for char in chars:
            try:
                readings = self.dict["%X" % ord(char)]
            except (KeyError, TypeError):
                # KeyError: not in the table. TypeError: ord() was given
                # something that is not a single character.
                result.append(char)
                continue
            first = readings.split(" ")[0].strip()
            # Drop the trailing character of the reading and lower-case it.
            result.append(first[:-1].lower())
        return self.splitter.join(result)


if __name__ == "__main__":
    p = pinyin()
    print(p.pinyin(u"中国人的一天"))

五笔与拼音

下载库:ChineseCode.dat

Python

# coding: utf-8
import io


class pinyin(object):
    """Look up per-character codes from a TAB-separated ``ChineseCode.dat``.

    Only two columns of each row are used: column 1 is the key (matched
    against ``"%X" % ord(char)``) and column 7 is the stored value.
    NOTE(review): both wb() and pinyin() read that same column; which
    encoding scheme it actually holds must be confirmed against the data.
    """

    # Number of TAB-separated columns a row needs before column 7 exists.
    _MIN_COLUMNS = 7

    def __init__(self, data_path='./ChineseCode.dat', encoding='utf-8'):
        """Load the lookup table.

        Args:
            data_path: Path of the TAB-separated data file.
            encoding: Text encoding of the data file. NOTE(review): UTF-8 is
                an assumption; pass another value (e.g. ``'gbk'``) if needed.
        """
        self.dict = {}
        with io.open(data_path, encoding=encoding) as table:
            for line in table:
                columns = line.split('\t')
                if len(columns) < self._MIN_COLUMNS:
                    # Short or blank row: skip it instead of raising
                    # IndexError and aborting the load.
                    continue
                self.dict[columns[0]] = columns[6]
        # String placed between the converted characters.
        self.splitter = ''

    def _lookup(self, char):
        """Return the stored value for ``char``, or None when unavailable."""
        try:
            return self.dict["%X" % ord(char)]
        except (KeyError, TypeError):
            # Unknown character, or not a single character at all.
            return None

    def wb(self, chars):
        """Return the stored value of each character followed by a newline.

        Characters without an entry are copied through unchanged (and
        without the newline). Pieces are joined with ``self.splitter``.
        """
        result = []
        for char in chars:
            value = self._lookup(char)
            result.append(char if value is None else value + '\n')
        return self.splitter.join(result)

    def pinyin(self, chars):
        """Return the first comma-separated item per character, lower-cased.

        The last character of that item is dropped before lower-casing.
        Characters without an entry are copied through unchanged. Pieces are
        joined with ``self.splitter``.
        """
        result = []
        for char in chars:
            value = self._lookup(char)
            if value is None:
                result.append(char)
            else:
                result.append(value.split(",")[0].strip()[:-1].lower())
        return self.splitter.join(result)


if __name__ == "__main__":
    p = pinyin()
    # The class defines wb(); the former call to get_wb() raised
    # AttributeError.
    print(p.wb(u"小强"))

合并两者

Python

# coding: utf-8
"""Merge Mandarin.dat and ChineseCode.dat into one pickled lookup table."""
try:
    import cPickle as pickle  # Python 2
except ImportError:
    import pickle  # Python 3
import io


def load_mandarin(path, encoding='utf-8'):
    """Read ``<key> TAB <value>`` lines into a dict.

    Values are right-stripped and lower-cased. Lines without a TAB are
    skipped. NOTE(review): the UTF-8 default is an assumption about the file.
    """
    table = {}
    with io.open(path, encoding=encoding) as src:
        for line in src:
            code, tab, readings = line.partition('\t')
            if tab:
                table[code] = readings.rstrip().lower()
    return table


def merge_codes(table, path, encoding='utf-8'):
    """Append columns 7 and 3 of ``path`` to the matching entries of ``table``.

    For every row whose first column is already a key of ``table`` the entry
    becomes ``<old value> TAB <column 7> TAB <column 3>``. Rows with fewer
    than seven columns are skipped. ``table`` is modified in place and also
    returned.
    """
    with io.open(path, encoding=encoding) as src:
        for line_no, line in enumerate(src):
            items = line.split('\t')
            if len(items) < 7:
                continue
            key = items[0]
            if key in table:
                table[key] = table[key] + '\t' + items[6] + '\t' + items[2]
                # Progress output: only merged rows whose line index is a
                # multiple of 100 are reported.
                if line_no % 100 == 0:
                    print(line_no)
    return table


def main(mandarin_path='./Mandarin.dat', code_path='./ChineseCode.dat',
         out_path='ChineseCode2.dat'):
    """Build the merged table and pickle it to ``out_path``.

    Returns:
        The merged dict that was written.
    """
    table = merge_codes(load_mandarin(mandarin_path), code_path)
    # Pickle streams are binary data, so open in 'wb'. Protocol 0 keeps the
    # output in the format Python 2's default dump produced.
    with open(out_path, 'wb') as out:
        pickle.dump(table, out, 0)
    return table


if __name__ == "__main__":
    main()

较完整的拼音与五笔

Python

# coding: utf-8
try:
    import cPickle as pickle  # Python 2
except ImportError:
    import pickle  # Python 3


class CnCode(object):
    """Pinyin and Wubi lookup backed by a pickled dict.

    Keys are matched against ``"%X" % ord(char)``. Each value is split on
    TAB: field 0 feeds pinyin() and field 1 feeds wb().
    """

    def __init__(self, data_path='./ChineseCode2.dat'):
        """Load the pickled table from ``data_path``.

        SECURITY: unpickling can execute arbitrary code. Only load a data
        file you built yourself or obtained from a source you trust.
        """
        # Pickle streams are binary data, so open in 'rb'.
        with open(data_path, 'rb') as f:
            self.dict = pickle.load(f)

    def pinyin(self, chars, splitter=' '):
        """Return the first reading of every character, joined by ``splitter``.

        The last character of the reading is dropped (presumably the tone
        digit; confirm against the data). Characters without an entry are
        copied through unchanged.
        """
        result = []
        for char in chars:
            try:
                entry = self.dict["%X" % ord(char)]
            except (KeyError, TypeError):
                # Unknown character, or not a single character.
                result.append(char)
                continue
            first = entry.split("\t")[0].split(" ")[0].strip()
            result.append(first[:-1])
        return splitter.join(result)

    def wb(self, char, splitter=','):
        """Return the codes in field 1 of ``char``'s entry, joined by ``splitter``.

        ``char`` itself is returned when it has no entry, when the entry has
        no second TAB-separated field, or when ``char`` is not a single
        character.
        """
        try:
            codes = self.dict["%X" % ord(char)].split("\t")[1].strip()
        except (KeyError, IndexError, TypeError):
            return char
        return splitter.join(codes.split(" "))


if __name__ == "__main__":
    p = CnCode()
    print(p.pinyin(u"中国人的一天"))
    print(p.wb(u"国"))

为拼音加上声调

ChineseCode2.dat下载

Python

# coding: utf-8
try:
    import cPickle as pickle  # Python 2
except ImportError:
    import pickle  # Python 3
import re


class CnCode(object):
    """Map Chinese characters to pinyin (optionally tone-marked) and Wubi.

    Keys of the pickled table are matched against ``"%X" % ord(char)``. Each
    value is split on TAB: field 0 holds space-separated readings that end in
    a tone digit, field 1 holds the codes returned by wb().

    Author: HzqGhost (QQ: 313143468)
    """

    # Finals recognised by the first version of this class. Tone placement
    # no longer consults this list; it is kept so existing code that reads
    # ``yunmu`` keeps working.
    yunmu = ('ang', 'eng', 'ing', 'ong', 'an', 'en', 'in', 'un', 'ai', 'ei',
             'ui', 'ao', 'ou', 'iu', 'ie', 'ue', 'er', 'en', 'a', 'o', 'e',
             'i', 'u')
    # Vowel -> its four tone-marked forms (tones 1-4), space separated.
    sheng = {
        'a': u'ā á ǎ à',
        'o': u'ō ó ǒ ò',
        'e': u'ē é ě è',
        'i': u'ī í ǐ ì',
        'u': u'ū ú ǔ ù',
        u'ü': u'ǖ ǘ ǚ ǜ',
    }
    _TONE_DIGITS = ('0', '1', '2', '3', '4', '5')

    def __init__(self, data_path='./ChineseCode2.dat'):
        """Load the pickled table from ``data_path``.

        SECURITY: unpickling can execute arbitrary code. Only load a data
        file you built yourself or obtained from a source you trust.
        """
        # Pickle streams are binary data, so open in 'rb'.
        with open(data_path, 'rb') as f:
            self.dict = pickle.load(f)

    def _tone_vowel_index(self, base):
        """Return the index of the letter that carries the tone mark, or -1.

        Standard placement: 'a' or 'e' if present; otherwise the 'o' of
        'ou'; otherwise the last vowel of the syllable.
        """
        for vowel in ('a', 'e'):
            pos = base.find(vowel)
            if pos >= 0:
                return pos
        pos = base.find('ou')
        if pos >= 0:
            return pos
        for pos in range(len(base) - 1, -1, -1):
            if base[pos] in self.sheng:
                return pos
        return -1

    def _with_tone_mark(self, syllable):
        """Turn a digit-suffixed syllable into a tone-marked one.

        ``'zhong1'`` becomes ``'zhōng'``. Tone digits outside 1-4 (neutral
        tone) just lose the digit. Input without a trailing tone digit, or
        without any vowel to mark, is returned unchanged.
        """
        digit = syllable[-1:]
        if digit not in self._TONE_DIGITS:
            return syllable
        base = syllable[:-1]
        tone = int(digit)
        if tone < 1 or tone > 4:
            # Neutral tone is written without a mark.
            return base
        pos = self._tone_vowel_index(base)
        if pos < 0:
            return syllable
        marked = self.sheng[base[pos]].split(' ')[tone - 1]
        return base[:pos] + marked + base[pos + 1:]

    def pinyin(self, chars, splitter=' ', issheng=False, isFirstUpper=True):
        """Convert Chinese characters to pinyin. powered by blog. (hzqghost)

        Only the first reading of each character is used. Characters that
        have no entry in the table are left out of the result.

        Args:
            chars: Unicode string to convert.
            splitter: String placed between the converted syllables.
            issheng: If true, return tone-marked pinyin; otherwise the
                trailing tone digit is simply removed.
            isFirstUpper: If true, upper-case the first letter of every
                syllable.

        Returns:
            The converted syllables joined with ``splitter``.
        """
        result = []
        for char in chars:
            try:
                entry = self.dict["%X" % ord(char)]
            except (KeyError, TypeError):
                # Unknown character, or not a single character: skipped.
                continue
            py = entry.split("\t")[0].split(" ")[0].strip()
            if issheng:
                py = self._with_tone_mark(py)
            else:
                py = py[:-1]
            if isFirstUpper:
                py = py[0:1].upper() + py[1:]
            result.append(py)
        return splitter.join(result)

    def wb(self, char, splitter=','):
        """Return the codes in field 1 of ``char``'s entry, joined by ``splitter``.

        ``char`` itself is returned when it has no entry, when the entry has
        no second TAB-separated field, or when ``char`` is not a single
        character.
        """
        try:
            codes = self.dict["%X" % ord(char)].split("\t")[1].strip()
        except (KeyError, IndexError, TypeError):
            return char
        return splitter.join(codes.split(" "))


if __name__ == "__main__":
    p = CnCode()
    print(p.pinyin(u"中国人的一天我看", ' ', True))
    print(p.wb(u"国"))

    本站是提供个人知识管理的网络存储空间,所有内容均由用户发布,不代表本站观点。请注意甄别内容中的联系方式、诱导购买等信息,谨防诈骗。如发现有害或侵权内容,请点击一键举报。
    转藏 分享 献花(0)

    0条评论

    发表

    请遵守用户 评论公约

    类似文章 更多