Here is the code:
import re
import jieba.analyse
import codecs
import pandas as pd


def simplification_text(xianbingshi):
    """Extract the text between <b> and <e> markers from the corpus file."""
    xianbingshi_simplification = []
    with codecs.open(xianbingshi, 'r', 'utf8') as f:
        for line in f:
            line = line.strip()
            # Grab every span wrapped between <b> and <e>
            line_write = re.findall(r'(?<=<b>).*?(?=<e>)', line)
            for line in line_write:
                xianbingshi_simplification.append(line)
    with codecs.open(r'C:\Users\Administrator.SC-201812211013\PycharmProjects\untitled29\yiwoqu\code\xianbingshi_write.txt', 'w', 'utf8') as f:
        for line in xianbingshi_simplification:
            f.write(line + '\n')


def jieba_text():
    """Segment the extracted text with jieba and deduplicate the words."""
    word_list = []
    data = open(r"C:\Users\Administrator.SC-201812211013\PycharmProjects\untitled29\xianbingshi_write.txt", encoding='utf-8').read()
    seg_list = jieba.cut(data, cut_all=False)  # accurate mode
    for i in seg_list:
        word_list.append(i.strip())
    # Deduplicate via a pandas DataFrame, keeping the first occurrence
    data_quchong = pd.DataFrame({'a': word_list})
    data_quchong.drop_duplicates(subset=['a'], keep='first', inplace=True)
    word_list = data_quchong['a'].tolist()
    with codecs.open('word.txt', 'w', 'utf8') as w:
        for line in word_list:
            w.write(line + '\n')


def word_messy(word):
    """Clean the word list: strip numeric and alphanumeric tokens."""
    word_sub_list = []
    with codecs.open(word, 'r', 'utf8') as f:
        for line in f:
            line_sub = re.sub(r"^[1-9]\d*\.\d*|^[A-Za-z0-9]+$|^[0-9]*$|^(-?\d+)(\.\d+)?$|^[A-Za-z0-9]{4,40}.*?", '', line)
            word_sub_list.append(line_sub)
    word_sub_list.sort()
    with codecs.open('word.txt', 'w', 'utf8') as w:
        for line in word_sub_list:
            w.write(line.strip("\n") + '\n')


if __name__ == '__main__':
    xianbingshi = r'C:\Users\Administrator.SC-201812211013\PycharmProjects\untitled29\yiwoqu\xianbingshi_sub_sen_all(1).txt'
    # simplification_text(xianbingshi)
    # word = r'C:\Users\Administrator.SC-201812211013\PycharmProjects\untitled29\word.txt'
    simplification_text(xianbingshi)
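The extraction step above relies on a lookbehind/lookahead pattern, r'(?<=<b>).*?(?=<e>)', to pull out only the text wrapped between <b> and <e> markers. A minimal standalone sketch of how that pattern behaves (the sample string below is made up for illustration and is not from the original data):

import re

sample = "<b>患者三天前出现发热<e>其余正常<b>伴咳嗽<e>"  # hypothetical sample line
print(re.findall(r'(?<=<b>).*?(?=<e>)', sample))
# ['患者三天前出现发热', '伴咳嗽']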
Addendum: using jieba in Python for word segmentation, and removing punctuation with re.
The code is below:
import re
import jieba

# Build the stop words into a dictionary
stopwords = {}
fstop = open('stop_words.txt', 'r', encoding='utf-8', errors='ignore')
for eachWord in fstop:
    stopwords[eachWord.strip()] = eachWord.strip()  # stop-word dictionary
fstop.close()

f1 = open('all.txt', 'r', encoding='utf-8', errors='ignore')
f2 = open('allutf11.txt', 'w', encoding='utf-8')
line = f1.readline()
while line:
    line = line.strip()  # strip leading/trailing whitespace
    # Remove punctuation (both ASCII and full-width Chinese symbols)
    line = re.sub(r"[0-9\s+\.\!\/_,$%^*()?;;:-【】+\"\']+|[+——!,;:。?、~@#¥%……&*()]+", " ", line)
    seg_list = jieba.cut(line, cut_all=False)  # jieba segmentation, accurate mode
    outStr = ""
    for word in seg_list:
        if word not in stopwords:
            outStr += word
            outStr += " "
    f2.write(outStr)
    line = f1.readline()
f1.close()
f2.close()
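The same stop-word filtering can also be wrapped in small reusable functions. This is only a minimal sketch under the same assumptions as the snippet above (stop_words.txt holds one word per line; the input/output file names are the same hypothetical ones); it uses a set for the stop words and " ".join instead of string concatenation, which is slightly more idiomatic but otherwise equivalent:

import re
import jieba

def load_stopwords(path='stop_words.txt'):
    """Read stop words, one per line, into a set for fast membership tests."""
    with open(path, 'r', encoding='utf-8', errors='ignore') as f:
        return {line.strip() for line in f if line.strip()}

def segment_line(line, stopwords):
    """Strip punctuation, segment with jieba, and drop stop words."""
    line = re.sub(r"[0-9\s+\.\!\/_,$%^*()?;;:-【】+\"\']+|[+——!,;:。?、~@#¥%……&*()]+", " ", line.strip())
    return " ".join(w for w in jieba.cut(line, cut_all=False) if w and w not in stopwords)

# Usage (file names are assumptions matching the snippet above):
# stopwords = load_stopwords()
# with open('all.txt', encoding='utf-8', errors='ignore') as fin, \
#         open('allutf11.txt', 'w', encoding='utf-8') as fout:
#     for line in fin:
#         fout.write(segment_line(line, stopwords) + "\n")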
The above is based on my personal experience. I hope it serves as a useful reference, and I also hope you will continue to support 服务器之家.
Original article: https://www.cnblogs.com/yiwoqu/p/11542002.html