本文通过实例讲述了使用 Python 正则表达式匹配中文的方法,分享给大家供大家参考,具体如下:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
|
#!/usr/bin/python
# -*- coding: cp936 -*-
"""Count Chinese characters that appear as two-character names in a GBK file.

Approach: text has to be decoded to unicode before a unicode regular
expression can match Chinese characters, which means the encoding of the
input file must be known in advance.  In this example it is GBK.

Each input line that contains exactly two Chinese characters is treated as
a name: the first character is counted as the family name and the second as
the first name.  The two tallies are written, most frequent first, to
``familyname.txt`` and ``firstname.txt``.
"""
import collections
import io
import re
import sys

try:
    import cPickle as mypickle  # Python 2
except ImportError:
    import pickle as mypickle  # Python 3 has no cPickle module

# Encoding of the input file and of both output files.
ENCODING = 'gbk'

# The pattern must be a unicode string so that the \u escapes denote code
# points; it matches one character in the range U+4E00 through U+9FA5.
HANZI_PATTERN = re.compile(u'[\u4e00-\u9fa5]')


def extract_hanzi(text):
    """Return every character of *text* in U+4E00..U+9FA5, in order.

    *text* must already be decoded (unicode), not raw bytes.
    """
    return HANZI_PATTERN.findall(text)


def count_names(lines, verbose=False):
    """Tally family-name and first-name characters over *lines*.

    A line contributes only when it contains exactly two Chinese
    characters; every other line is skipped.

    Args:
        lines: iterable of decoded (unicode) text lines.
        verbose: when true, print each stripped line, the number of Chinese
            characters found in it, and the family-name character of every
            line that is counted.

    Returns:
        A ``(family_counts, first_counts)`` pair of ``collections.Counter``
        objects keyed by a single character.
    """
    family_counts = collections.Counter()
    first_counts = collections.Counter()
    for raw_line in lines:
        line = raw_line.strip()
        candidates = extract_hanzi(line)
        if verbose:
            print(line)
            print(len(candidates))
        if len(candidates) != 2:
            continue
        family, first = candidates
        if verbose:
            print(family)
        family_counts[family] += 1
        first_counts[first] += 1
    return family_counts, first_counts


def write_counts(path, items, encoding=ENCODING):
    """Write ``name<TAB>count`` lines for each pair in *items* to *path*.

    Args:
        path: output file name; an existing file is overwritten.
        items: iterable of ``(name, count)`` pairs, written in the given
            order.
        encoding: text encoding of the output file.
    """
    # The context manager closes the file even if a write fails.
    with io.open(path, 'w', encoding=encoding) as out:
        for name, count in items:
            out.write(u'{0}\t{1}\n'.format(name, count))


def main(source='demo.txt', family_out='familyname.txt',
         first_out='firstname.txt', encoding=ENCODING):
    """Read *source*, count name characters and write both result files.

    Args:
        source: input text file, one candidate name per line.
        family_out: output file for the family-name tally.
        first_out: output file for the first-name tally.
        encoding: encoding used for the input and both outputs.
    """
    # Decode once at the I/O boundary; everything downstream is unicode.
    with io.open(source, 'r', encoding=encoding) as src:
        lines = src.readlines()

    family_counts, first_counts = count_names(lines, verbose=True)

    # most_common() orders the pairs by count, highest first.
    family_items = family_counts.most_common()
    print(family_items)
    first_items = first_counts.most_common()

    write_counts(family_out, family_items, encoding)
    write_counts(first_out, first_items, encoding)
    print('finish')


if __name__ == '__main__':
    main()
运行效果图如下:
希望本文所述对大家的 Python 程序设计有所帮助。