本文实例讲述了Python基于多线程实现抓取数据存入数据库的方法。分享给大家供大家参考,具体如下:
1. 数据库类
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
|
""" 使用须知: 代码中数据表名 aces ,需要更改该数据表名称的注意更改 """ import pymysql class Database(): # 设置本地数据库用户名和密码 host = "localhost" user = "root" password = "" database = "test" port = 3306 charset = "utf8" cursor = '' connet = '' def __init__( self ): #连接到数据库 self .connet = pymysql.connect(host = self .host , user = self .user,password = self .password , database = self .database, charset = self .charset) self .cursor = self .connet.cursor() # #删表 def dropTables( self ): self .cursor.execute( '''''drop table if exists aces''' ) print ( "删表" ) #建表 def createTables( self ): self .cursor.execute( '''''create table if not exists aces ( asin varchar(11) primary key not null, checked varchar(200));''' ) print ( "建表" ) #保存数据 def save( self ,aceslist): self .cursor.execute( "insert into aces ( asin, checked) values(%s,%s)" , (aceslist[ 0 ],aceslist[ 1 ])) self .connet.commit() #判断元素是否已经在数据库里,在就返回true ,不在就返回false def is_exists_asin( self ,asin): self .cursor.execute( 'select * from aces where asin = %s' ,asin) if self .cursor.fetchone() is None : return False return True # db =Database() |
2. 多线程任务类
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
|
import urllib.parse
import urllib.request
from queue import Queue
import time
import random
import threading
import logging
import pymysql
from bs4 import BeautifulSoup
from local_data import Database

# This module holds three cooperating classes:
#   AmazonSpider    -- fetches one product page and stores its title
#   ThreadCrawl     -- worker thread draining a shared queue
#   AmazonSpiderJob -- builds the queue and starts the worker pool
# (bug fix: the original imported urllib.parse twice.)


class AmazonSpider():
    def __init__(self):
        # Each spider owns its own DB connection (pymysql connections are
        # not thread-safe, so one per worker thread is required).
        self.db = Database()

    def randHeader(self):
        """Return a randomized HTTP header dict to reduce request blocking."""
        head_connection = ['Keep-Alive', 'close']
        head_accept = ['text/html, application/xhtml+xml, */*']
        head_accept_language = ['zh-CN,fr-FR;q=0.5', 'en-US,en;q=0.8,zh-Hans-CN;q=0.5,zh-Hans;q=0.3']
        head_user_agent = [
            'Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko',
            'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1500.95 Safari/537.36',
            'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; rv:11.0) like Gecko)',
            'Mozilla/5.0 (Windows; U; Windows NT 5.2) Gecko/2008070208 Firefox/3.0.1',
            'Mozilla/5.0 (Windows; U; Windows NT 5.1) Gecko/20070309 Firefox/2.0.0.3',
            'Mozilla/5.0 (Windows; U; Windows NT 5.1) Gecko/20070803 Firefox/1.5.0.12',
            'Opera/9.27 (Windows NT 5.2; U; zh-cn)',
            'Mozilla/5.0 (Macintosh; PPC Mac OS X; U; en) Opera 8.0',
            'Opera/8.0 (Macintosh; PPC Mac OS X; U; en)',
            'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.12) Gecko/20080219 Firefox/2.0.0.12 Navigator/9.0.0.6',
            'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Win64; x64; Trident/4.0)',
            'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0)',
            'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.2; .NET4.0C; .NET4.0E)',
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Maxthon/4.0.6.2000 Chrome/26.0.1410.43 Safari/537.1 ',
            'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.2; .NET4.0C; .NET4.0E; QQBrowser/7.3.9825.400)',
            'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:21.0) Gecko/20100101 Firefox/21.0 ',
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.92 Safari/537.1 LBBROWSER',
            'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0; BIDUBrowser 2.x)',
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/3.0 Safari/536.11',
        ]
        header = {
            'Connection': head_connection[0],
            'Accept': head_accept[0],
            'Accept-Language': head_accept_language[1],
            # random.choice is the idiomatic form of randrange-then-index
            'User-Agent': random.choice(head_user_agent),
        }
        return header

    def getDataById(self, queryId):
        """Fetch the Amazon page for ASIN `queryId` and save its title.

        Skips the ASIN entirely when it is already in the database.
        """
        if self.db.is_exists_asin(queryId):
            return
        req = urllib.request.Request(url="https://www.amazon.com/dp/" + str(queryId),
                                     headers=self.randHeader())
        # bug fix: close the HTTP response when done (original leaked the socket)
        with urllib.request.urlopen(req) as webpage:
            html = webpage.read()
        soup = BeautifulSoup(html, 'html.parser')
        content = soup.find_all("span", id="asTitle")
        if len(content):
            # span#asTitle found -- take its text
            state = content[0].string
        else:
            # some ASIN pages have no span#asTitle; record a placeholder
            state = "other"
        print(queryId)
        print(state)
        self.db.save([queryId, state])


class ThreadCrawl(threading.Thread):
    """Worker thread: repeatedly pulls an ASIN from the queue and crawls it."""

    def __init__(self, queue):
        FORMAT = time.strftime("[%Y-%m-%d %H:%M:%S]", time.localtime()) + "[AmazonSpider]-----%(message)s------"
        logging.basicConfig(level=logging.INFO, format=FORMAT)
        threading.Thread.__init__(self)
        self.queue = queue
        self.spider = AmazonSpider()  # each worker gets its own spider/DB handle

    def run(self):
        while True:
            item = self.queue.get()  # blocks until an item is available
            try:
                self.spider.getDataById(item)
            except Exception:
                # bug fix: the original bare `except:` also swallowed
                # SystemExit/KeyboardInterrupt and hid the traceback.
                # Failed items are re-queued for another attempt.
                logging.exception("failed on %s, re-queueing", item)
                self.queue.put(item)
            logging.info("now queue size is: %d" % self.queue.qsize())
            self.queue.task_done()  # signal one unit of work complete


class AmazonSpiderJob():
    """Fills a queue with ASINs and runs a pool of ThreadCrawl workers."""

    def __init__(self, size, qs):
        self.size = size  # number of worker threads
        self.qs = qs      # iterable of ASINs to crawl

    def work(self):
        toSpiderQueue = Queue()
        for q in self.qs:
            toSpiderQueue.put(q)
        for _ in range(self.size):
            t = ThreadCrawl(toSpiderQueue)
            # daemon=True so workers die with the main thread;
            # bug fix: setDaemon() is deprecated in modern Python.
            t.daemon = True
            t.start()
        toSpiderQueue.join()  # block until every queued item is task_done()
3. 主线程类
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
|
from amazon_s import AmazonSpiderJob  # worker-pool driver class
import pymysql
import pandas as pd
from local_data import Database

if __name__ == '__main__':  # bug fix: source read "= =", which is a syntax error
    # First run only: drop any stale table and recreate it.
    # Comment these three lines out on later runs to keep existing rows.
    db = Database()
    db.dropTables()
    db.createTables()

    df = pd.read_excel("ASIN检查_viogico_1108.xlsx")
    # print(df.info())
    qs = df["asin1"].values  # column of ASINs to crawl
    print(qs)
    print(len(qs))

    # 8 worker threads drain the ASIN queue concurrently.
    amazonJob = AmazonSpiderJob(8, qs)
    amazonJob.work()
希望本文所述对大家Python程序设计有所帮助。
原文链接:https://blog.csdn.net/zn505119020/article/details/78590416