pre_file.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
|
# -*- coding: utf-8 -*-
"""Segment crawled page text with jieba and dump the result to a text file.

Reads every ``link``/``content`` pair from the ``test1.spider`` MySQL table,
cuts ``content`` into words with jieba and writes one line per row in the
form ``<link> <word>/<word>/...`` (CRLF-terminated) to a UTF-8 text file.

The script uses ``reload(sys)`` / ``sys.setdefaultencoding``, which exist
only on Python 2.
"""
import MySQLdb
import MySQLdb as mdb
import os
import sys
import string
import jieba
import codecs

# Python 2 only: switch the interpreter-wide default string encoding to UTF-8.
reload(sys)
sys.setdefaultencoding('utf-8')

# Raw string so the backslashes of the Windows path are never read as escapes.
OUTPUT_PATH = r'C:\Users\kk\Desktop\hello-result1.txt'

# Connect to the database.
# NOTE(review): credentials are hard-coded here; consider moving them out of
# the source file.
try:
    conn = mdb.connect(host='127.0.0.1', user='root', passwd='kongjunli',
                       db='test1', charset='utf8')
except Exception as e:
    print(e)
    # Exit with a non-zero status so a failed connection is detectable.
    sys.exit(1)

try:
    # DictCursor returns each row as a dict keyed by column name.
    cursor = conn.cursor(mdb.cursors.DictCursor)
    try:
        sql = 'SELECT link,content FROM test1.spider;'
        cursor.execute(sql)
        # fetchall() pulls every result row into memory, so the cursor and
        # the connection can be released before the file is written.
        data = cursor.fetchall()
    finally:
        cursor.close()
finally:
    # The script only reads, so there is no transaction to commit; closing
    # the connection is all that is required.
    conn.close()

with codecs.open(OUTPUT_PATH, 'w', 'utf-8') as f:
    for row in data:
        # A NULL content column arrives as None; treat it as empty text
        # instead of letting jieba fail on it.
        content = row['content'] or u''
        # cut_all must be the boolean False (accurate mode). The string
        # 'False' is truthy and would silently select full mode instead.
        seg = '/'.join(jieba.cut(content, cut_all=False))
        f.write(row['link'] + ' ' + seg + '\r\n')
jiansuo.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
|
# -*- coding: utf-8 -*-
"""Score stored documents against a typed query using a TF-IDF model.

Loads all rows of the ``cutresult_copy`` MySQL table, splits column index 1
of every row on ``'/'`` to obtain tokens, builds a gensim dictionary,
bag-of-words corpus and TF-IDF model from them, then prompts for a query and
prints the similarity scores between the query and every document in
descending order.

The script uses ``reload(sys)`` / ``sys.setdefaultencoding`` and
``raw_input``, which exist only on Python 2.
"""
import sys
import string
import MySQLdb
import MySQLdb as mdb
import gensim
from gensim import corpora, models, similarities
from gensim.similarities import MatrixSimilarity
import logging
import codecs

# Python 2 only: switch the interpreter-wide default string encoding to UTF-8.
reload(sys)
sys.setdefaultencoding('utf-8')

# NOTE(review): credentials are hard-coded here; consider moving them out of
# the source file.
con = mdb.connect(host='127.0.0.1', user='root', passwd='kongjunli',
                  db='test1', charset='utf8')
try:
    cur = con.cursor()
    try:
        cur.execute('SELECT * FROM cutresult_copy')
        # Everything is fetched up front, so the cursor and the connection
        # can be released before the model is built.
        rows = cur.fetchall()
    finally:
        cur.close()
finally:
    con.close()


class MyCorpus(object):
    """Iterable that yields one token list per fetched row.

    Each document is column index 1 of a row, converted with ``str`` and
    split on ``'/'``.  Presumably that column holds '/'-joined word
    segmentation output -- verify against the table schema.
    """

    def __iter__(self):
        for row in rows:
            yield str(row[1]).split('/')


# Enable logging.
logging.basicConfig(format='%(asctime)s:%(levelname)s:%(message)s',
                    level=logging.INFO)

Corp = MyCorpus()

# Convert the documents to TF-IDF.
dictionary = corpora.Dictionary(Corp)
# Bag-of-words representation of every document.
corpus = [dictionary.doc2bow(text) for text in Corp]
# Fit the TF-IDF model on the bag-of-words corpus ...
tfidf = models.TfidfModel(corpus)
# ... and apply it to obtain the TF-IDF weighted corpus.
corpus_tfidf = tfidf[corpus]

# An earlier variant read the query from a text file instead of prompting;
# that dead code has been removed.
query = raw_input('Enter your query:')
# NOTE(review): the query is split on whitespace while the documents are
# split on '/'; a query typed without spaces becomes a single token.
vec_bow = dictionary.doc2bow(query.split())
vec_tfidf = tfidf[vec_bow]

# Pass the feature count explicitly so the index does not have to scan the
# corpus to discover it.
index = similarities.MatrixSimilarity(corpus_tfidf,
                                      num_features=len(dictionary))
sims = index[vec_tfidf]
similarity = list(sims)
# NOTE(review): only the scores are printed, so which document each score
# belongs to is lost; sorting enumerate(sims) by score would keep it.
print(sorted(similarity, reverse=True))
encodings.xml
1
2
3
4
5
6
|
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <component name="Encoding">
    <file url="PROJECT" charset="UTF-8" />
  </component>
</project>
misc.xml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
|
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <component name="ProjectLevelVcsManager" settingsEditedManually="false">
    <OptionsSetting value="true" id="Add" />
    <OptionsSetting value="true" id="Remove" />
    <OptionsSetting value="true" id="Checkout" />
    <OptionsSetting value="true" id="Update" />
    <OptionsSetting value="true" id="Status" />
    <OptionsSetting value="true" id="Edit" />
    <ConfirmationsSetting value="0" id="Add" />
    <ConfirmationsSetting value="0" id="Remove" />
  </component>
  <component name="ProjectRootManager" version="2" project-jdk-name="Python 2.7.11 (C:\Python27\python.exe)" project-jdk-type="Python SDK" />
</project>
modules.xml
1
2
3
4
5
6
7
8
|
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <component name="ProjectModuleManager">
    <modules>
      <module fileurl="file://$PROJECT_DIR$/.idea/爬虫练习代码.iml" filepath="$PROJECT_DIR$/.idea/爬虫练习代码.iml" />
    </modules>
  </component>
</project>